In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [3]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from pytorch_tabnet.tab_model import TabNetClassifier


# Load the CSV file
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
sum(train_data.duplicated())

0

In [5]:
train_data.drop_duplicates(inplace=True)

In [6]:
train_data.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
train_data.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [9]:
unique_labels = train_data["Sex"].explode().unique()
print(unique_labels)

['male' 'female']


In [10]:
from sklearn.impute import SimpleImputer

# Extract the 'Cabin' column
cabin_column = train_data[['Cabin']]
embarked_column = train_data[['Embarked']]


# Use SimpleImputer to fill in NaN values in the 'Cabin' column
imputer = SimpleImputer(strategy='most_frequent')
cabin_column_imputed = imputer.fit_transform(cabin_column)
embarked_column_imputed = imputer.fit_transform(embarked_column)

# Flatten the 2D array to a 1D array using ravel()
cabin_column_imputed = cabin_column_imputed.ravel()
train_data['Embarked']=embarked_column_imputed.ravel()

# Convert the NumPy array to a pandas Series
cabin_column_imputed_series = pd.Series(cabin_column_imputed)

# Extract the cabin letter
train_data['CabinLetter'] = cabin_column_imputed_series.astype(str).str[0]

# age related median transform
age_column = train_data[['Age']]
imputer = SimpleImputer(strategy='median')
train_data[['Age']]=imputer.fit_transform(age_column)

fare_column = train_data[['Fare']]
train_data[['Fare']]=imputer.fit_transform(fare_column)

# Define your age bins and labels
age_bins = [0, 18, 35, 50, 100]  # You can adjust these bins as needed
age_labels = ['0-18', '19-35', '36-50', '51+']

# Create a new categorical column 'AgeCategory' based on the bins and labels
train_data['AgeCategory'] = pd.cut(train_data['Age'], bins=age_bins, labels=age_labels, include_lowest=True)

# Drop the original 'Age' column if you no longer want to use it in the analysis
train_data = train_data.drop('Age', axis=1)


# Map the cabin letter to ordinal categories
cabin_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}  # Add more categories as needed
embarked_mapping = {'S': 1, 'C': 2, 'Q': 3}
gender_mapping = {'male': 1, 'female': 2}
train_data['CabinOrdinal'] = train_data['CabinLetter'].map(cabin_mapping)
train_data['SexOrdinal'] = train_data['Sex'].map(gender_mapping)
train_data['Cabin']=train_data['CabinOrdinal']
train_data['EmbarkedOrdinal'] = train_data['Embarked'].map(embarked_mapping)
train_data['Embarked']=train_data['EmbarkedOrdinal']
train_data['Sex']=train_data['SexOrdinal']
# Display the result
#print(train_data[['Cabin', 'CabinLetter', 'CabinOrdinal', 'EmbarkedOrdinal']])
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   PassengerId      891 non-null    int64   
 1   Survived         891 non-null    int64   
 2   Pclass           891 non-null    int64   
 3   Name             891 non-null    object  
 4   Sex              891 non-null    int64   
 5   SibSp            891 non-null    int64   
 6   Parch            891 non-null    int64   
 7   Ticket           891 non-null    object  
 8   Fare             891 non-null    float64 
 9   Cabin            891 non-null    int64   
 10  Embarked         891 non-null    int64   
 11  CabinLetter      891 non-null    object  
 12  AgeCategory      891 non-null    category
 13  CabinOrdinal     891 non-null    int64   
 14  SexOrdinal       891 non-null    int64   
 15  EmbarkedOrdinal  891 non-null    int64   
dtypes: category(1), float64(1), int64(11), objec

In [11]:
from sklearn.preprocessing import StandardScaler
train_data=train_data.drop(columns=['EmbarkedOrdinal'])
train_data=train_data.drop(columns=['SexOrdinal'])
train_data=train_data.drop(columns=['Ticket'])
train_data=train_data.drop(columns=['Name'])
train_data=train_data.drop(columns=['CabinOrdinal'])
train_data=train_data.drop(columns=['CabinLetter'])
columns_to_scale = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the selected columns
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

In [12]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [13]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Fare,Cabin,Embarked,AgeCategory
0,1,0,0.827377,1,0.432793,-0.473674,-0.502445,-0.348001,-0.568837,19-35
1,2,1,-1.566107,2,0.432793,-0.473674,0.786845,0.743791,1.005181,36-50
2,3,1,0.827377,2,-0.474545,-0.473674,-0.488854,-0.348001,-0.568837,19-35
3,4,1,-1.566107,2,0.432793,-0.473674,0.42073,0.743791,-0.568837,19-35
4,5,0,0.827377,1,-0.474545,-0.473674,-0.486337,-0.348001,-0.568837,19-35


import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.heatmap(train_data.corr(), cmap='coolwarm',annot=True)
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.heatmap(train_data.corr(), cmap='coolwarm',annot=True)
plt.show()

sns.set_style('whitegrid')
sns.pairplot(train_data)
plt.show()

In [14]:
from sklearn.impute import SimpleImputer

# Extract the 'Cabin' column
cabin_column = test_data[['Cabin']]
embarked_column = test_data[['Embarked']]

# Use SimpleImputer to fill in NaN values in the 'Cabin' column
imputer = SimpleImputer(strategy='most_frequent')
cabin_column_imputed = imputer.fit_transform(cabin_column)
embarked_column_imputed = imputer.fit_transform(embarked_column)

# Flatten the 2D array to a 1D array using ravel()
cabin_column_imputed = cabin_column_imputed.ravel()
test_data['Embarked']=embarked_column_imputed.ravel()
# Convert the NumPy array to a pandas Series
cabin_column_imputed_series = pd.Series(cabin_column_imputed)

# Extract the cabin letter
test_data['CabinLetter'] = cabin_column_imputed_series.astype(str).str[0]

# age related median transform
age_column = test_data[['Age']]
fare_column = test_data[['Fare']]
imputer = SimpleImputer(strategy='median')
test_data[['Age']]=imputer.fit_transform(age_column)
test_data[['Fare']]=imputer.fit_transform(fare_column)



# Define your age bins and labels
age_bins = [0, 18, 35, 50, 100]  # You can adjust these bins as needed
age_labels = ['0-18', '19-35', '36-50', '51+']

# Create a new categorical column 'AgeCategory' based on the bins and labels
test_data['AgeCategory'] = pd.cut(test_data['Age'], bins=age_bins, labels=age_labels, include_lowest=True)

# Drop the original 'Age' column if you no longer want to use it in the analysis
test_data = test_data.drop('Age', axis=1)


# Map the cabin letter to ordinal categories
cabin_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}  # Add more categories as needed
embarked_mapping = {'S': 1, 'C': 2, 'Q': 3}
gender_mapping = {'male': 1, 'female': 2}
test_data['CabinOrdinal'] = test_data['CabinLetter'].map(cabin_mapping)
test_data['SexOrdinal'] = test_data['Sex'].map(gender_mapping)
test_data['Cabin']=test_data['CabinOrdinal']
test_data['EmbarkedOrdinal'] = test_data['Embarked'].map(embarked_mapping)
test_data['Embarked']=test_data['EmbarkedOrdinal']
test_data['Sex']=test_data['SexOrdinal']
# Display the result
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   PassengerId      418 non-null    int64   
 1   Pclass           418 non-null    int64   
 2   Name             418 non-null    object  
 3   Sex              418 non-null    int64   
 4   SibSp            418 non-null    int64   
 5   Parch            418 non-null    int64   
 6   Ticket           418 non-null    object  
 7   Fare             418 non-null    float64 
 8   Cabin            418 non-null    int64   
 9   Embarked         418 non-null    int64   
 10  CabinLetter      418 non-null    object  
 11  AgeCategory      418 non-null    category
 12  CabinOrdinal     418 non-null    int64   
 13  SexOrdinal       418 non-null    int64   
 14  EmbarkedOrdinal  418 non-null    int64   
dtypes: category(1), float64(1), int64(10), object(3)
memory usage: 46.5+ KB
None


In [15]:
test_data=test_data.drop(columns=['EmbarkedOrdinal'])
test_data=test_data.drop(columns=['SexOrdinal'])
test_data=test_data.drop(columns=['Ticket'])
test_data=test_data.drop(columns=['Name'])
test_data=test_data.drop(columns=['CabinOrdinal'])
test_data=test_data.drop(columns=['CabinLetter'])
columns_to_scale = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the selected columns
test_data[columns_to_scale] = scaler.fit_transform(test_data[columns_to_scale])


In [16]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    float64 
 3   Sex          891 non-null    int64   
 4   SibSp        891 non-null    float64 
 5   Parch        891 non-null    float64 
 6   Fare         891 non-null    float64 
 7   Cabin        891 non-null    float64 
 8   Embarked     891 non-null    float64 
 9   AgeCategory  891 non-null    category
dtypes: category(1), float64(6), int64(3)
memory usage: 63.8 KB


In [17]:
columns_with_nan = train_data.columns[train_data.isnull().any()]

# Display columns with NaN values
print("Columns with NaN values:", columns_with_nan)

Columns with NaN values: Index([], dtype='object')


women = train_data.loc[train_data.Sex == 2]["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

men = train_data.loc[train_data.Sex == 1]["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

In [18]:
# Handle missing values
train_data = train_data.dropna()
test_data = test_data.dropna()

# Convert categorical variables to one-hot encoding
train_data = pd.get_dummies(train_data, columns=["Sex", "Embarked", "AgeCategory"])
test_data = pd.get_dummies(test_data, columns=["Sex", "Embarked", "AgeCategory"])

# Alternatively, you can explicitly replace True and False with 1 and 0
train_data = train_data.replace({True: 1, False: 0})

test_data = test_data.replace({True: 1, False: 0})

# Select columns with non-numeric data types
non_numeric_columns = train_data.select_dtypes(exclude=[np.number]).columns
print(non_numeric_columns)
print (train_data[non_numeric_columns])

non_numeric_columnsTest = test_data.select_dtypes(exclude=[np.number]).columns
print(non_numeric_columnsTest)
print (test_data[non_numeric_columnsTest])


# Check for non-numeric values
non_numeric_values = train_data[train_data[non_numeric_columns].apply(lambda x: x.str.isnumeric()).any(axis=1)]

# Check for non-numeric values
non_numeric_valuesTest = test_data[test_data[non_numeric_columnsTest].apply(lambda x: x.str.isnumeric()).any(axis=1)]


# Print rows with non-numeric values
print("Rows with non-numeric values:")
print(non_numeric_values)
print(non_numeric_valuesTest)



Index([], dtype='object')
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[891 rows x 0 columns]
Index([], dtype='object')
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[418 rows x 0 columns]
Rows with non-numeric values:
Empty Da

  train_data = train_data.replace({True: 1, False: 0})
  test_data = test_data.replace({True: 1, False: 0})


In [19]:
# Convert boolean values to integers (0s and 1s)
train_data = train_data.astype(int)
test_data = test_data.astype(int)


# Alternatively, you can explicitly replace True and False with 1 and 0
train_data = train_data.replace({True: 1, False: 0})

test_data = test_data.replace({True: 1, False: 0})

# Check for non-numeric values
non_numeric_columns = train_data.select_dtypes(exclude=["float64", "int64"]).columns
if not non_numeric_columns.empty:
    raise ValueError("Non-numeric data found in columns: {}".format(non_numeric_columns))

# Define features and target variable
X = train_data.drop(columns=["Survived", "PassengerId"])
y = train_data["Survived"]
xTest = test_data.drop(columns=["PassengerId"])


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the data to PyTorch tensors
X_trainNp= X_train.to_numpy()
y_trainNp     = y_train.to_numpy().reshape(-1, 1)
y_trainNp = y_trainNp.flatten()
X_testNp = X_test.to_numpy()
X_testNPO = xTest.to_numpy()

# Initialize the TabNet classifier
model = TabNetClassifier()

# Fit the model
model.fit(X_train=X_trainNp, y_train=y_trainNp, batch_size=32,max_epochs=10)

# Predict on the test data
predictions = model.predict(X_testNp)
outputpredictions = model.predict(X_testNPO)

# Evaluate the model (optional)
accuracy = (predictions == y_test.values).mean()
print("Accuracy:", accuracy)







epoch 0  | loss: 0.72335 |  0:00:00s
epoch 1  | loss: 0.57321 |  0:00:00s
epoch 2  | loss: 0.56018 |  0:00:01s
epoch 3  | loss: 0.55875 |  0:00:01s
epoch 4  | loss: 0.51186 |  0:00:01s
epoch 5  | loss: 0.50057 |  0:00:01s
epoch 6  | loss: 0.50813 |  0:00:02s
epoch 7  | loss: 0.49736 |  0:00:02s
epoch 8  | loss: 0.48958 |  0:00:02s
epoch 9  | loss: 0.48582 |  0:00:03s
Accuracy: 0.7821229050279329


In [20]:

# Create a DataFrame with the average predictions
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': outputpredictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [21]:
import pandas as pd

# Load the CSV files into Pandas DataFrames
df1 = pd.read_csv('submission.csv')
#df2 = pd.read_csv('submission1.csv')

# Compare the two DataFrames and find the differences
#differences = df1.compare(df2)

# Print the differences
print("Differences between the two CSV files:")
#print(differences)

Differences between the two CSV files:
