# LOGISTIC REGRESSION - TITANIC DATASET
# WESLEY ALDRICH
# 2702363613
# PPTI 17

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the raw passenger table.
df = pd.read_csv('titanic.csv')

# Drop the columns that are not used as features and label missing
# embarkation ports as 'Unknown'. Age keeps its missing values here.
df_dropped = (
    df
    .drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket'])
    .assign(Embarked=lambda frame: frame['Embarked'].fillna('Unknown'))
)

df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [2]:
import math

def handle_nan(value, default=0):
    """Return `value`, or `default` when `value` is missing.

    A value is treated as missing when it is None or a floating-point NaN
    (for example the mean of an empty selection).

    Args:
        value: A real number (Python or NumPy scalar) or None.
        default: Replacement returned for a missing value. Defaults to 0.

    Returns:
        `value` unchanged when it is a real number that is not NaN,
        otherwise `default`.

    Raises:
        TypeError: If `value` is neither None nor a real number.
    """
    # math.isnan() rejects None, so check for it before the NaN test.
    if value is None:
        return default
    return default if math.isnan(value) else value

In [3]:
def _survival_rate(frame, column, value):
  """Survival percentage (0-100) among rows of `frame` where `column == value`.

  The mean of an empty selection is NaN; `handle_nan` turns that into 0.
  """
  return handle_nan(frame.loc[frame[column] == value, 'Survived'].mean() * 100)


def wesley_process(X_train, X_test, y_train, y_test, random_state=0):
  """Encode the features, fit a logistic regression and return its macro F1.

  Every statistic used for encoding (the Age mean and the per-category
  survival rates) is computed from the training split only and then applied
  to both splits, so nothing is derived from the test split.

  Args:
    X_train: Training features with the columns 'Age', 'Pclass', 'Sex' and
      'Embarked'; any other columns are passed to the model unchanged.
    X_test: Test features with the same columns as `X_train`.
    y_train: Training labels, aligned with `X_train` by index.
    y_test: Test labels, aligned with `X_test` by index.
    random_state: Seed forwarded to `LogisticRegression`. The 'sag' solver
      shuffles the data, so a fixed seed makes the score reproducible.
      Pass None to let the solver draw its own seed.

  Returns:
    The macro-averaged F1 score of the predictions on the test split.
  """
  # Work on copies so the caller's frames are left untouched.
  df_train = X_train.copy()
  df_train['Survived'] = y_train

  df_test = X_test.copy()
  df_test['Survived'] = y_test

  # Fill missing ages with the mean age of the training split.
  train_mean = df_train['Age'].mean()
  df_train['Age'] = df_train['Age'].fillna(train_mean)
  df_test['Age'] = df_test['Age'].fillna(train_mean)

  def encode(column, scores, fallback):
    """Replace `column` in both splits by its score; unlisted values get `fallback`."""
    def to_score(value):
      return scores.get(value, fallback)

    df_train[column] = df_train[column].apply(to_score)
    df_test[column] = df_test[column].apply(to_score)

  # The score arguments are evaluated before encode() overwrites the column,
  # so every rate below is computed from the original category values.

  # Passenger class: survival rate scaled by 3, 2 and 1 for classes 1, 2 and
  # 3. Any value other than 1 or 2 receives the third-class score.
  encode(
      'Pclass',
      {
          1: 3 * _survival_rate(df_train, 'Pclass', 1),
          2: 2 * _survival_rate(df_train, 'Pclass', 2),
      },
      fallback=1 * _survival_rate(df_train, 'Pclass', 3),
  )

  # Sex: any value other than 'male' receives the 'female' rate.
  encode(
      'Sex',
      {'male': _survival_rate(df_train, 'Sex', 'male')},
      fallback=_survival_rate(df_train, 'Sex', 'female'),
  )

  # Embarked: every rate is NaN-guarded, so a port that is absent from the
  # training split is encoded as 0 instead of feeding NaN into the model.
  # Any value other than 'S', 'C' or 'Q' receives the 'Unknown' rate.
  encode(
      'Embarked',
      {port: _survival_rate(df_train, 'Embarked', port) for port in ('S', 'C', 'Q')},
      fallback=_survival_rate(df_train, 'Embarked', 'Unknown'),
  )

  X_train = df_train.drop(columns=['Survived'])
  y_train = df_train['Survived']
  X_test = df_test.drop(columns=['Survived'])
  y_test = df_test['Survived']

  # NOTE(review): the features are not scaled; the large max_iter presumably
  # compensates for slow 'sag' convergence on unscaled data - confirm.
  model = LogisticRegression(solver="sag", max_iter=9999, random_state=random_state)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  report = classification_report(y_test, y_pred, output_dict=True)
  f1_avg = report['macro avg']['f1-score']

  return f1_avg

In [4]:
from sklearn.metrics import classification_report
import numpy as np

# Separate the target column from the feature columns.
y = df_dropped['Survived']
X = df_dropped.drop(columns=['Survived'])

# Repeat the split/fit/score cycle once for every split seed.
random_states = range(128)
f1_avgs = []

for state in random_states:
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.3, random_state=state
  )
  f1_avg = wesley_process(X_train, X_test, y_train, y_test)
  f1_avgs.append(f1_avg)

# Look up the seeds that produced the lowest and the highest score.
min_state = random_states[int(np.argmin(f1_avgs))]
max_state = random_states[int(np.argmax(f1_avgs))]

print(f"Minimum f1-score avg: {min(f1_avgs):.4f} at random_state {min_state}")
print(f"Maximum f1-score avg: {max(f1_avgs):.4f} at random_state {max_state}")

Minimum f1-score avg: 0.7181 at random_state 114
Maximum f1-score avg: 0.8494 at random_state 6
