# Test-2 by Varij Dave (100855095)

In [10]:
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score

In [11]:
# Load the train.csv dataset
url = "https://raw.githubusercontent.com/varijdave/AI-Algorithm/main/train.csv"
df = pd.read_csv(url)

1. Drop the following attributes: Name, Ticket, PassengerId

In [12]:
# Drop the specified attributes
df.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)

2. Check for missing values and:

a. drop the whole column if it is missing more than 70% of its values,

In [13]:
# Drop columns with more than 70% missing values
df.dropna(thresh=len(df) * 0.7, axis=1, inplace=True)

b. fill in the missing values with the mean of the attribute if it is a numerical attribute,

In [14]:
# Fill missing values in numerical columns with mean
num_cols = df.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='mean')
df[num_cols] = imputer.fit_transform(df[num_cols])

c. fill in the missing values with the mode of the attribute if it is categorical.

In [15]:
# Fill missing values in categorical columns with mode
cat_cols = df.select_dtypes(include='object').columns
mode_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = mode_imputer.fit_transform(df[cat_cols])

3. One hot encode Sex and Embarked attributes using get_dummies.

In [16]:
# One hot encode Sex and Embarked attributes
df = pd.get_dummies(df, columns=['Sex', 'Embarked'])

4. Split the dataset into 60% for training, 20% for validation, and 20% for testing. Print the .shape for the created X_train, y_train, X_validation, y_validation, X_test, y_test.

In [17]:
X = df.drop('Survived', axis=1)
y = df['Survived']

In [18]:
#Split the dataset
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [19]:
# Print shapes of the created datasets
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_validation shape: ", X_validation.shape)
print("y_validation shape: ", y_validation.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (534, 10)
y_train shape:  (534,)
X_validation shape:  (178, 10)
y_validation shape:  (178,)
X_test shape:  (179, 10)
y_test shape:  (179,)


5. Train the following classifiers to predict if a passenger will survive:
a. Logistic Regression
b. Decision Tree
c. Gaussian Naïve Bayes
d. Support Vector Machine
e. Random Forrest
f. Gradient Boosting
g. Ada Boost

In [20]:
# Train classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gaussian Naive Bayes': GaussianNB(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Ada Boost': AdaBoostClassifier()
}

for classifier_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


6. Report on the .score(X_validation, y_validation) of each model.

In [21]:
#Evaluate models - accuracy score
accuracy_scores = {}
for classifier_name, classifier in classifiers.items():
    accuracy_scores[classifier_name] = classifier.score(X_validation, y_validation)

7. Report on the f1_score(y_validation, y_predicted validation) of each model.

In [22]:
# Evaluate models - f1 score
f1_scores = {}
for classifier_name, classifier in classifiers.items():
    y_pred = classifier.predict(X_validation)
    f1_scores[classifier_name] = f1_score(y_validation, y_pred)

8. Among the 7 models you trained, which one performed best? Limit your answer to 3 lines in a markdown cell.

In [23]:
# Identify best performing model
best_model = max(accuracy_scores, key=accuracy_scores.get)

9. Test the best performing model and report on its .score(X_test, y_test) and f1_score(y_test, y_predicted_test).

In [24]:
#Test the best performing model
best_model_classifier = classifiers[best_model]
best_model_classifier.fit(X_train, y_train)
y_pred_test = best_model_classifier.predict(X_test)
test_accuracy = best_model_classifier.score(X_test, y_test)
test_f1_score = f1_score(y_test, y_pred_test)
# Print results
print("Accuracy scores on validation set: ", accuracy_scores)
print("F1 scores on validation set: ", f1_scores)
print("Best performing model: ", best_model)
print("Accuracy score on test set: ", test_accuracy)
print("F1 score on test set: ", test_f1_score)

Accuracy scores on validation set:  {'Logistic Regression': 0.7640449438202247, 'Decision Tree': 0.702247191011236, 'Gaussian Naive Bayes': 0.7471910112359551, 'Support Vector Machine': 0.6853932584269663, 'Random Forest': 0.7752808988764045, 'Gradient Boosting': 0.8033707865168539, 'Ada Boost': 0.8033707865168539}
F1 scores on validation set:  {'Logistic Regression': 0.6666666666666666, 'Decision Tree': 0.5954198473282443, 'Gaussian Naive Bayes': 0.6666666666666666, 'Support Vector Machine': 0.36363636363636365, 'Random Forest': 0.6774193548387097, 'Gradient Boosting': 0.7008547008547009, 'Ada Boost': 0.7199999999999999}
Best performing model:  Gradient Boosting
Accuracy score on test set:  0.8100558659217877
F1 score on test set:  0.7499999999999999


10. Load the Titanic test.csv file

In [18]:
#Load test.csv
url_test = 'https://raw.githubusercontent.com/varijdave/AI-Algorithm/main/test.csv'
df_test = pd.read_csv(url_test)

11. Preprocess the test.csv data the same way you preprocessed the training dataset

In [19]:
# Drop unnecessary columns
df_test = df_test.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis=1)

In [20]:
# Fill missing values with mean and mode
df_test['Age'].fillna(df['Age'].mean(), inplace=True)
df_test['Fare'].fillna(df['Fare'].mean(), inplace=True)

In [21]:
# One-hot encode categorical columns
df_test_encoded = pd.get_dummies(df_test, columns=['Sex', 'Embarked'])

12. Use the same 7 models you used earlier to predict the number of survived passengers in the test.csv dataset.

In [22]:
#Use trained models to predict survival in test.csv
predictions = {}
for classifier_name, classifier in classifiers.items():
    predictions[classifier_name] = sum(classifier.predict(df_test_encoded))

13. Comparing the performance of all the models you have trained, how many passengers from the test set (test.cvs) do you think actually survived? Limit your answer to 3 lines in a markdown cell.

In [23]:
# Take majority vote from all models' predictions
total_passengers = df_test.shape[0]
survived_passengers = max(predictions.values())

In [24]:
# Print results
print("Predicted number of passengers survived in test.csv: ", survived_passengers, " out of ", total_passengers)

Predicted number of passengers survived in test.csv:  179.0  out of  418


# Do I think they survived?
The final count of survivors among passengers in the test.csv dataset cannot be determined for certain, as it is contingent upon the combined predictions of various trained models. This number is derived from the collective vote of all the models, and unfortunately, there is no "Survived" column in the test.csv dataset to confirm the actual survival of passengers.