# Trying to fit a linear model with pytorch

## Preparing data first in the most direct way

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
from matplotlib import pyplot as plt

df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')
df_all = pd.concat([df_train, df_test])
df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set'
df_all.sample(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
59,951,,1,"Chaudanson, Miss. Victorine",female,36.0,0,0,PC 17608,262.375,B61,C
574,575,0.0,3,"Rush, Mr. Alfred George John",male,16.0,0,0,A/4. 20589,8.05,,S
858,859,1.0,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24.0,0,3,2666,19.2583,,C
148,149,0.0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26.0,F2,S
32,924,,3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33.0,1,2,C.A. 2315,20.575,,S


In [3]:
def prepare_features(data):
  # missing values
  data.loc[data['Age'].isna(), 'Age'] = data['Age'].mode()[0].item()
  data.loc[data['Embarked'].isna(), 'Embarked'] = data['Embarked'].mode().item()
  data['Fare'].fillna(data['Fare'].mode().item())
  # Cabin: no need as we do delete it
  # categorical
  data = pd.get_dummies(data, columns = ['Sex', 'Pclass', 'Embarked'])
  # useless columns
  data = data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis = 1)
  # numerical
  data['Age'] /= data['Age'].max() 
  data['SibSp'] /= data['SibSp'].max()
  data['Parch'] /= data['Parch'].max()
  data['Fare'] = np.log1p(data['Fare'])
  data['Fare'] /= data['Fare'].max()
  return data

In [4]:
def prepare_features_initial(data):
  # missing values
  f_age_mean = data.loc[(~data['Age'].isna()) & (data['Sex'].str.contains("female")), 'Age'].mean()
  data.loc[(data['Age'].isna()) & (data['Sex'].str.contains("female")), 'Age'] = f_age_mean  
  m_age_mean = data.loc[(~data['Age'].isna()) & (data['Sex'].str.contains("male")), 'Age'].mean()
  data.loc[(data['Age'].isna()) & (data['Sex'].str.contains("male")), 'Age'] = m_age_mean  
  data['Age'] = data['Age'].astype(float)
  
  data.loc[data['PassengerId'] == 62, 'Embarked'] = 'S'
  data.loc[data['PassengerId'] == 830, 'Embarked'] = 'S'
  
  r = data.loc[(data['Pclass'] == 3) & (data['Embarked'].str.contains('S')) & (data['SibSp'] == 0) & (data['Parch'] == 0), 'Fare'].mode().item()
  data.loc[data['PassengerId'] == 1044, 'Fare'] = r
  # Cabin: no need as we do delete it
  # categorical
  data = pd.get_dummies(data, columns = ['Sex', 'Pclass', 'Embarked'])
  # useless columns
  data = data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis = 1)
  # numerical
  data['Age'] /= data['Age'].max() 
  data['SibSp'] /= data['SibSp'].max()
  data['Parch'] /= data['Parch'].max()
  data['Fare'] = np.log1p(data['Fare'])
  data['Fare'] /= data['Fare'].max()
  return data

In [5]:
def prepare_features_medium(data):
  # missing values
  f_age_mean = data.loc[(~data['Age'].isna()) & (data['Sex'].str.contains("female")), 'Age'].mean()
  data.loc[(data['Age'].isna()) & (data['Sex'].str.contains("female")), 'Age'] = f_age_mean  
  m_age_mean = data.loc[(~data['Age'].isna()) & (data['Sex'].str.contains("male")), 'Age'].mean()
  data.loc[(data['Age'].isna()) & (data['Sex'].str.contains("male")), 'Age'] = m_age_mean  
  data['Age'] = data['Age'].astype(float)
  
  data.loc[data['PassengerId'] == 62, 'Embarked'] = 'S'
  data.loc[data['PassengerId'] == 830, 'Embarked'] = 'S'
  
  r = data.loc[(data['Pclass'] == 3) & (data['Embarked'].str.contains('S')) & (data['SibSp'] == 0) & (data['Parch'] == 0), 'Fare'].mode().item()
  data.loc[data['PassengerId'] == 1044, 'Fare'] = r
  # Cabin: no need as we do delete it

  # we combine SibSp and Parch to measure the family size including the passenger
  data['Family'] = 1 + data['SibSp'] + data['Parch']
  # categorical
  data = pd.get_dummies(data, columns = ['Sex', 'Pclass', 'Embarked'])
  # useless columns
  data = data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId', 'SibSp', 'Parch'], axis = 1)
  # numerical
  data['Age'] /= data['Age'].max() 
  data['Family'] /= data['Family'].max()
  data['Fare'] = np.log1p(data['Fare'])
  data['Fare'] /= data['Fare'].max()
  return data

In [6]:
df_all = prepare_features_medium(df_all)

In [7]:
df_train = df_all.iloc[:891]
df_train_y = df_train['Survived']
df_train_x = df_train.drop(['Survived'], axis=1)

df_test = df_all.iloc[891:]
df_train.tail(5)

Unnamed: 0,Survived,Age,Fare,Family,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
886,0.0,0.3375,0.422864,0.090909,0,1,0,1,0,0,0,1
887,1.0,0.2375,0.550238,0.090909,1,0,1,0,0,0,0,1
888,0.0,0.358589,0.512205,0.363636,1,0,0,0,1,0,0,1
889,1.0,0.325,0.550238,0.090909,0,1,1,0,0,1,0,0
890,0.0,0.4,0.347554,0.090909,0,1,0,0,1,0,1,0


In [8]:
df_test.head()

Unnamed: 0,Survived,Age,Fare,Family,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,,0.43125,0.348997,0.090909,0,1,0,0,1,0,1,0
1,,0.5875,0.333195,0.181818,1,0,0,0,1,0,0,1
2,,0.775,0.379604,0.090909,0,1,0,1,0,0,1,0
3,,0.3375,0.363449,0.090909,0,1,0,0,1,0,0,1
4,,0.275,0.414494,0.272727,1,0,0,0,1,0,0,1


## Now fitting a linear model 

In [9]:
import torch

# create tensors input, weights, bias
# compute accuracy
# define loss function

train = torch.tensor(df_train_x.to_numpy(), dtype=torch.float32)
labels = torch.tensor(df_train_y.to_numpy(), dtype=torch.float32)
# no bias because we do hot encoding rather than dummy 
train.shape



torch.Size([891, 11])

In [25]:
def compute_preds(coeffs, x):
    return (x * coeffs).sum(axis = 1)

In [26]:
def compute_loss(coeffs, x, y):
    #preds = compute_preds(coeffs, x)
    #return torch.where(y == 1.0, 1.0 - preds, preds).sum()
    return torch.abs(compute_preds(coeffs, x) - y).mean()

In [27]:
def run_epoch(weights, x, y, rate):
    loss = compute_loss(weights, x, y)
    loss.backward()
    with torch.no_grad():
        weights.sub_(weights.grad * rate)
        weights.grad.zero_()
    print(f"{loss:.3f}", end="; ")

In [45]:
def run_model(epochs=10, lr=0.01):
    weights = torch.rand(11)-0.5
    weights.requires_grad_()
    for e in range(epochs):
            run_epoch(weights, train, labels, lr)
    return weights

In [46]:
def accuracy(coeffs, x, y):
    with torch.no_grad():
        preds = compute_preds(coeffs, x)
    return ((preds > 0.5) == y.bool()).float().mean()

In [47]:
parameters = run_model(20, 0.2)

0.842; 0.695; 0.593; 0.533; 0.490; 0.450; 0.420; 0.395; 0.373; 0.353; 0.338; 0.328; 0.321; 0.318; 0.325; 0.330; 0.306; 0.319; 0.310; 0.322; 

In [48]:
accuracy(parameters, train, labels)

tensor(0.7834)

In [49]:
with torch.no_grad():
    result = compute_preds(parameters, df_test.drop(['Survived'], axis=1))
result

0      0.159618
1      0.546358
2      0.293369
3     -0.049422
4      0.455499
         ...   
413   -0.044302
414    1.020222
415   -0.021514
416   -0.044302
417   -0.123936
Length: 418, dtype: float64

In [50]:
df_submit = df_test
results = pd.DataFrame(results)
results.columns = ['Survived']
results.head()

Unnamed: 0,Survived
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [20]:
df_submit = df_submit.fillna(results)
df_submit.head()

Unnamed: 0,Survived,Age,Fare,Family,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0.0,0.43125,0.348997,0.090909,0,1,0,0,1,0,1,0
1,0.0,0.5875,0.333195,0.181818,1,0,0,0,1,0,0,1
2,0.0,0.775,0.379604,0.090909,0,1,0,1,0,0,1,0
3,0.0,0.3375,0.363449,0.090909,0,1,0,0,1,0,0,1
4,0.0,0.275,0.414494,0.272727,1,0,0,0,1,0,0,1


In [21]:
df_submit = df_submit.drop(['Age', 'Family', 'Fare', 'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S'], axis=1)
df_submit['Survived'] = df_submit['Survived'].astype('int')
df_submit['PassengerId'] = (892 + df_submit.index) 
df_submit.head()

Unnamed: 0,Survived,PassengerId
0,0,892
1,0,893
2,0,894
3,0,895
4,0,896


In [22]:
df_submit.to_csv('/kaggle/working/20230525_titanic_linear.csv', columns=['PassengerId', 'Survived'], header=True, index=False, sep=',')

In [23]:
!tail /kaggle/working/20230525_titanic_linear.csv
!tail /kaggle/input/titanic/gender_submission.csv

1300,0
1301,0
1302,0
1303,1
1304,0
1305,0
1306,1
1307,0
1308,0
1309,0
1300,1
1301,1
1302,1
1303,1
1304,1
1305,0
1306,1
1307,0
1308,0
1309,0


conclusion: **scored 0.61483** worst than GB and SVC from last year and **0.69617** with pytorch.where() loss function instead of MAE

Improvement: added better missing values gives a score of **0.61722** and **0.69617** with pytorch.where() 