In [2]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier

import numpy as np

from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [3]:
def read_file_to_dataframe(file_address):
    """Load a passenger CSV and encode it into a frame suitable for model input.

    Processing steps, in order:

    1. Read the CSV located at ``file_address``.
    2. Drop the ``Name`` column.
    3. Replace every missing value with 0.
    4. One-hot encode ``Sex``, ``Pclass`` and ``SibSp``.
    5. In ``Ticket``, ``Cabin`` and ``Embarked``, substitute each letter and
       each listed punctuation character with a numeric code (digits are left
       untouched), producing digit-only strings.
    6. Write the encoded frame to ``data_after_applying_encoding.csv`` in the
       current working directory.
    7. Drop duplicate rows and replace any remaining missing value with 0.

    Parameters
    ----------
    file_address : str or path-like
        Location of the CSV file to read. It must contain the columns
        ``Name``, ``Sex``, ``Pclass``, ``SibSp``, ``Ticket``, ``Cabin`` and
        ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        The encoded, de-duplicated frame. The original row index is kept.
    """
    # Read from the path the caller passed in. The previous version ignored
    # the argument and read a module-level ``file_path`` variable instead.
    data = pd.read_csv(file_address)

    # The passenger name is not used as a feature.
    data = data.drop(columns=['Name'])

    # Fill blanks with 0 in all columns.
    data = data.fillna(0)

    # Turn the low-cardinality columns into indicator columns.
    encoded_df = pd.get_dummies(data, columns=['Sex', 'Pclass', 'SibSp'])

    # Ticket, Cabin and Embarked hold free-form text that is not treated as a
    # category here; every character below is swapped for a numeric code.
    # 'A'..'Z' -> '1'..'26', 'a'..'z' -> '27'..'52'.
    upper_codes = {chr(ord('A') + i): str(i + 1) for i in range(26)}
    lower_codes = {chr(ord('a') + i): str(i + 27) for i in range(26)}
    punctuation_codes = {
        ' ': '100',
        '\\': '103',
        '/': '104',
        '<': '105',
        ',': '106',
        '>': '107',
        '.': '108',
        '?': '109',
        ':': '110',
        ';': '111',
        '"': '112',
        '\'': '113',
        '|': '114',
    }
    translation_table = str.maketrans(
        {**upper_codes, **punctuation_codes, **lower_codes}
    )

    # Non-string cells (for example the 0 placeholders written above) are
    # expected to come back as missing from ``.str.translate``; the final
    # ``fillna`` below turns them into 0 again.
    for text_column in ('Ticket', 'Cabin', 'Embarked'):
        encoded_df[text_column] = encoded_df[text_column].str.translate(translation_table)

    # Export the data after manipulation (before de-duplication).
    encoded_df.to_csv('data_after_applying_encoding.csv', index=False)

    # Keep only the first occurrence of each fully identical row, then fill
    # the gaps. A non-inplace fillna avoids mutating a slice of encoded_df.
    duplicate_mask = encoded_df.duplicated()
    df = encoded_df[~duplicate_mask].fillna(0)

    return df




In [4]:
# Load and encode the labelled training CSV.
file_path = '/kaggle/input/titanic-survival-completion-dataset/train.csv'
df = read_file_to_dataframe(file_path)

# Target: the 'Survived' column, kept as a one-column DataFrame.
y = df[['Survived']]

# Features: every remaining column (a new frame; df itself is left untouched).
X = df.drop(columns=['Survived'])

In [5]:
# Split the data into a training part and a test part (about 20% of the rows
# are held out for testing, the remaining 80% train the model).
#
# X_train / y_train : features and expected results used to fit the model
# X_test  / y_test  : held-out features and their expected results
#
# random_state is fixed so that the split - and therefore every metric
# reported further down - is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Sanity check: (number of rows, number of feature columns) in the training split.
X_train.shape

(712, 19)

In [7]:
# Model selection: exactly one estimator is active; the alternatives tried by
# the author are kept below, commented out, with the results they noted.

#--------------------- Random Forest classifier (active) ----------------------------
# Author's note from an earlier run: 147 of 178 test rows predicted correctly
# (accuracy about 0.82, i.e. 82%).
model =  RandomForestClassifier(n_estimators=100, random_state=42)

#...................................................................................................


#--------------------- Decision Tree classifier -------------------------------------
# Author's note from an earlier run: 140 of 178 test rows matched
# (Mean Absolute Error 0.21787709497206703, accuracy about 0.79, i.e. 79%).
#model=DecisionTreeClassifier()
#...................................................................................................

#--------------------- MultiOutputRegressor -----------------------------------------
# Not recommended here: it requires a 2-D array for the output variable (y).
#model = MultiOutputRegressor(DecisionTreeRegressor())
#.......................................................................................................

#-------------------------------------------------------------------------------------
# Neural network model (needs Sequential and Dense, which are not imported in this notebook)

#model = Sequential()
#model.add(Dense(64, input_dim=20, activation='relu'))  # Input layer with 64 neurons and ReLU activation
#model.add(Dense(32, activation='relu'))  # Hidden layer with 32 neurons and ReLU activation
#model.add(Dense(1, activation='sigmoid'))  # Output layer with 1 neuron (binary classification) and sigmoid activation

# Compile the model
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#-------------------------------------------------------------------------------------

In [8]:
# The classifier expects a flat (1-D) label vector, so collapse the
# one-column target into a plain array before fitting.
y_train = np.asarray(y_train).ravel()

# Fit the selected model on the training split.
model.fit(X_train, y_train)


# Equivalent call for the commented-out neural network variant:
#model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

In [9]:
# Predict survival for the held-out rows of train.csv.
prediction = model.predict(X_test)

# Regression-style error metrics on the 0/1 labels. For binary labels the
# absolute and squared errors coincide, so MAE and MSE report the same value.
mae = mean_absolute_error(y_test, prediction)
mse = mean_squared_error(y_test, prediction)
# RMSE is taken as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error is deprecated and removed in recent scikit-learn
# releases, so it is no longer relied on here.
rmse = float(np.sqrt(mse))
r2 = r2_score(y_test, prediction)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R²) Score:", r2)


# Classification accuracy: fraction of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, prediction)
print("Accuracy:", accuracy)


Mean Absolute Error (MAE): 0.15083798882681565
Mean Squared Error (MSE): 0.15083798882681565
Root Mean Squared Error (RMSE): 0.38837866680189276
R-squared (R²) Score: 0.3519710378117459
Accuracy: 0.8491620111731844


In [10]:
# Persist the hold-out evaluation: features, expected labels and predictions
# are each written to their own CSV, then stitched into one combined file.

# Hold-out features, with their existing column names.
input_df = pd.DataFrame(X_test, columns=list(X_test.columns))

# Expected labels, relabelled so they cannot be confused with the predictions.
output_df = pd.DataFrame(y_test, columns=['Survived']).rename(
    columns={'Survived': 'Survived_test'}
)

# Model predictions (a 1-D array) as a single-column frame.
prediction_df = pd.DataFrame(prediction, columns=['Survived']).rename(
    columns={'Survived': 'Survived_Predicted'}
)

input_df.to_csv('/kaggle/working/input_df_results.csv', index=False)
output_df.to_csv('/kaggle/working/output_df_results.csv', index=False)
prediction_df.to_csv('/kaggle/working/prediction_df_results.csv', index=False)

# Reading the files back yields three frames that all carry a fresh 0..n-1
# index, so the column-wise concat below pairs rows by position.
df_input_csv = pd.read_csv('/kaggle/working/input_df_results.csv')
df_output_csv = pd.read_csv('/kaggle/working/output_df_results.csv')
df_prediction_csv = pd.read_csv('/kaggle/working/prediction_df_results.csv')

# Final CSV: input data X (from train.csv), expected output y (from train.csv)
# and the model's prediction, side by side.
combined_df = pd.concat(
    [df_input_csv, df_output_csv, df_prediction_csv],
    axis=1,
)
combined_df.to_csv('/kaggle/working/model_results.csv', index=False)

In [11]:
# Produce predictions for the Kaggle-provided test file.

# CSV with the unlabelled passengers to score (test data, not training data).
file_path = '/kaggle/input/titanic/test.csv'
df22 = read_file_to_dataframe(file_path)

df22.to_csv('/kaggle/working/Data_before_prediction.csv', index=False)

# The test file is one-hot encoded independently of the training file, so its
# indicator columns can differ (a category value present in only one of the
# two files) or come in a different order. Align to the exact columns the
# model was fitted on: missing indicators become 0, unseen extras are dropped.
Xa = df22.reindex(columns=X_train.columns, fill_value=0)


prediction_submission = model.predict(Xa)

# Predictions (a 1-D array) as a single-column frame.
# NOTE(review): Kaggle's Titanic submission format is understood to expect
# this column to be named 'Survived' - confirm before uploading the final file.
prediction_df_Prediction = pd.DataFrame(prediction_submission, columns=['Survived_Predicted'])

prediction_df_Prediction.to_csv('/kaggle/working/output_df_results.csv', index=False)


# Features that were fed to the model.
input_df = pd.DataFrame(Xa, columns=Xa.columns.tolist())

input_df.to_csv('/kaggle/working/input_df_results.csv', index=False)


# Reading both files back gives frames with a fresh 0..n-1 index, so the
# column-wise concat below pairs each PassengerId with its prediction by position.
df_input_csv_fin = pd.read_csv('/kaggle/working/input_df_results.csv')
df_output_csv_fin = pd.read_csv('/kaggle/working/output_df_results.csv')


# Only the passenger identifier is needed next to the prediction.
df_input_csv_fin = df_input_csv_fin[['PassengerId']]

combined_df_fin = pd.concat([df_input_csv_fin, df_output_csv_fin], axis=1)

# Final results of the Titanic survival prediction.
combined_df_fin.to_csv('/kaggle/working/Predicted_Survivals_from_TestData.csv', index=False)
