In [53]:
import pandas as pd
import numpy as np 
import math as m 

In [54]:
# Read the training and test CSV files into DataFrames
train_ds, test_ds = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [55]:
# Print dtypes and non-null counts for both datasets to spot columns with missing values
train_ds.info()
test_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

In [56]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns of the training set
train_ds.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [57]:
# Preview the first rows of the raw training set
train_ds.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [58]:
# Impute missing 'Age' values with the median age of passengers sharing the same
# Pclass and Sex. Each dataset is imputed from its own group medians.
train_ds['Age'] = (
    train_ds
    .groupby(by=['Pclass', 'Sex'])['Age']
    .transform(lambda ages: ages.fillna(ages.median()))
)
test_ds['Age'] = (
    test_ds
    .groupby(by=['Pclass', 'Sex'])['Age']
    .transform(lambda ages: ages.fillna(ages.median()))
)

In [59]:
# Mark passengers without a recorded cabin with the explicit placeholder 'Unknown'
train_ds['Cabin'] = train_ds['Cabin'].fillna('Unknown')
test_ds['Cabin'] = test_ds['Cabin'].fillna('Unknown')


In [60]:
# Replace missing 'Embarked' entries in the training set with the column's most frequent value (mode)
train_ds['Embarked'] = train_ds['Embarked'].fillna(train_ds['Embarked'].mode().iloc[0])

In [61]:
# Fill the missing 'Fare' values in the test dataset with the median fare of
# passengers sharing the same Pclass and Embarked.
# The group medians are only used to patch missing fares: groupby drops rows whose
# key is NaN, so assigning the transform result directly would replace the existing
# fare of any passenger with a missing 'Embarked' by NaN.
test_ds['Fare'] = test_ds['Fare'].fillna(
    test_ds.groupby(['Pclass', 'Embarked'])['Fare'].transform('median')
)

In [62]:
# Re-check the non-null counts of both datasets after the imputation steps above
train_ds.info()
test_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

In [63]:
def cabin_to_num(input : str):
    """Encode a 'Cabin' entry as a single number.

    Every cabin token consists of a deck letter optionally followed by a room
    number (e.g. ``'C85'`` or ``'T'``). A token is encoded as
    ``(ord(letter) - ord('A')) * 1000 + room_number`` (the room number counts
    as 0 when absent). An entry listing several cabins is encoded as the mean
    of its token codes.

    Args:
        input: Cabin string with tokens separated by whitespace, or the
            placeholder ``'Unknown'``. (The name shadows the builtin but is
            kept so existing keyword callers keep working.)

    Returns:
        ``-1`` when the entry is empty/blank or contains the ``'Unknown'``
        placeholder, otherwise the mean code as a float.

    Raises:
        ValueError: if the part after the deck letter is not an integer.
    """
    def name_to_num(name : str):
        # Deck letter -> thousands, room number (if any) -> units.
        deck_code = (ord(name[0]) - ord('A')) * 1000
        room = name[1:]
        return deck_code + int(room) if room else deck_code

    # split() without an argument collapses repeated/leading/trailing whitespace,
    # so no empty tokens reach name_to_num (split(' ') would produce them).
    cabins = input.split()
    if not cabins or 'Unknown' in cabins:
        return -1
    return sum(name_to_num(cabin) for cabin in cabins) / len(cabins)

In [64]:
# Convert 'Sex' column to numeric
sex_mapping = {'male': 0, 'female': 1}
train_ds['Sex'] = train_ds['Sex'].map(sex_mapping)
test_ds['Sex'] = test_ds['Sex'].map(sex_mapping)

# Convert 'Cabin' strings to numeric codes (cabin_to_num is defined in an earlier cell)
train_ds['Cabin'] = train_ds['Cabin'].apply(cabin_to_num)
test_ds['Cabin'] = test_ds['Cabin'].apply(cabin_to_num)

# Convert 'Embarked' column to numeric
embarked_mapping = {'C': 0, 'Q': 1, 'S': 2}
train_ds['Embarked'] = train_ds['Embarked'].map(embarked_mapping)
test_ds['Embarked'] = test_ds['Embarked'].map(embarked_mapping)

# Drop the columns that are not used as features:
#   - 'Ticket': its values have weird patterns
#   - 'Name' and 'PassengerId'
# NOTE(review): 'PassengerId' is removed from test_ds as well; confirm it is not
# needed later (e.g. to label predictions).
dropped_cols = ['Ticket', 'Name', 'PassengerId']
train_ds.drop(columns=dropped_cols, inplace=True)
test_ds.drop(columns=dropped_cols, inplace=True)

# Display the first few rows of the modified train_ds. Only the last expression of a
# cell is rendered, so inspect test_ds.head() in a cell of its own if needed.
train_ds.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,0,22.0,1,0,7.25,-1.0,2
1,1,1,1,38.0,1,0,71.2833,2085.0,0
2,1,3,1,26.0,0,0,7.925,-1.0,2
3,1,1,1,35.0,1,0,53.1,2123.0,2
4,0,3,0,35.0,0,0,8.05,-1.0,2


In [65]:
from sklearn.preprocessing import MinMaxScaler

# Columns to be min-max scaled
numeric_cols = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch', 'Cabin','Embarked']

# Learn each column's minimum and maximum from the training set only
scaler = MinMaxScaler()
scaler.fit(train_ds[numeric_cols])

# Apply the training-set scaling to both datasets
train_ds[numeric_cols] = scaler.transform(train_ds[numeric_cols])
test_ds[numeric_cols] = scaler.transform(test_ds[numeric_cols])



In [66]:
# Display a random sample of 5 rows from the normalized train_ds
train_ds.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
346,1,0.5,1,0.497361,0.0,0.0,0.025374,0.0,1.0
284,0,0.0,0,0.497361,0.0,0.0,0.050749,0.001053,1.0
824,0,1.0,0,0.019854,0.5,0.166667,0.077465,0.0,1.0
3,1,0.0,1,0.434531,0.125,0.0,0.103644,0.111784,1.0
21,1,0.5,0,0.421965,0.0,0.0,0.025374,0.160886,1.0


In [None]:
# Use the first 600 records for training
x_train = train_ds.drop(columns='Survived').iloc[:600]
y_train = train_ds['Survived'].iloc[:600]

# Use the last 200 records for testing
# NOTE(review): train_ds has 891 rows, so rows 600-690 end up in neither subset — confirm this is intended.
x_test = train_ds.drop(columns='Survived').iloc[-200:]
y_test = train_ds['Survived'].iloc[-200:]

In [67]:
# Initialise a 7x5 weight matrix with samples from the standard normal distribution.
# NOTE(review): no random seed is set in the visible cells, so these values change on every run.
# NOTE(review): train_ds has 8 feature columns once 'Survived' is dropped; if w_1 is meant to
# multiply x_train, a first dimension of 7 would not match — confirm.
w_1 = np.random.randn(7, 5)
w_1

array([[ 2.04794072, -0.37263031, -0.16974781,  0.93754383, -0.87226782],
       [-0.61928782, -0.35972889,  0.64581368,  0.14920403, -0.25323783],
       [-0.48100446, -0.40889239,  2.16868505, -2.21845875,  0.32704588],
       [ 1.39756755, -1.52748108,  1.04879334,  1.10301532,  2.06988107],
       [ 2.58864054,  1.55188592,  0.10108218, -0.02518194,  0.87221249],
       [ 0.95807967, -0.31063811,  0.10661459,  0.11713914,  1.23398616],
       [-0.0038549 , -0.9689682 , -1.10565896,  0.99868989,  1.68997989]])