<a href="https://colab.research.google.com/github/thedenaas/jub_ml/blob/main/fall_2022/seminar_1/seminar1_solved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!gdown 16r8U9ouAazF5FbNOgan__jsFwMq-tKlb # train

Downloading...
From: https://drive.google.com/uc?id=16r8U9ouAazF5FbNOgan__jsFwMq-tKlb
To: /content/train.csv
  0% 0.00/61.2k [00:00<?, ?B/s]100% 61.2k/61.2k [00:00<00:00, 57.0MB/s]


In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('train.csv')
data = data[['Survived', 'Sex', 'Age', 'Pclass', 'Fare', 'SibSp', 'Parch']]

In [4]:
from sklearn.model_selection import train_test_split

## preprocessing and data spliting

###1. Find out which features in data have nans and replace them with mean values for each sex. 

In [5]:
data.isna().describe()

Unnamed: 0,Survived,Sex,Age,Pclass,Fare,SibSp,Parch
count,891,891,891,891,891,891,891
unique,1,1,2,1,1,1,1
top,False,False,False,False,False,False,False
freq,891,891,714,891,891,891,891


In [6]:
mean_age = data.groupby('Sex')['Age'].mean().to_dict()
data.loc[data.Age.isna(), 'Age'] = data.loc[data.Age.isna(), 'Sex'].apply(lambda x: mean_age[x]) 

In [7]:
data

Unnamed: 0,Survived,Sex,Age,Pclass,Fare,SibSp,Parch
0,0,male,22.000000,3,7.2500,1,0
1,1,female,38.000000,1,71.2833,1,0
2,1,female,26.000000,3,7.9250,0,0
3,1,female,35.000000,1,53.1000,1,0
4,0,male,35.000000,3,8.0500,0,0
...,...,...,...,...,...,...,...
886,0,male,27.000000,2,13.0000,0,0
887,1,female,19.000000,1,30.0000,0,0
888,0,female,27.915709,3,23.4500,1,2
889,1,male,26.000000,1,30.0000,0,0


### 2. Replace all Sex and Pclass features with their one-hot encoding

In [8]:
dum1 = pd.get_dummies(data['Pclass'], prefix='Pclass')
dum2 = pd.get_dummies(data['Sex'])

In [9]:
data = pd.concat((data, dum1, dum2), axis=1)

In [10]:
data = data.drop(['Sex', 'Pclass'], axis=1)

### 3. Split data into train/val/test with the proportion 0.6/0.2/0.2. Extract tragets

In [11]:
train, test = train_test_split(data, test_size=0.4)
val, test = train_test_split(test, test_size=0.5)

In [12]:
train_y = train.pop('Survived')
val_y = val.pop('Survived')
test_y = test.pop('Survived')

In [13]:
data_splits = {'train': (train, train_y), 'val': (val, val_y), 'test': (test, test_y)}

### 4. Fit a StandardScaler on numeric train features and transform all. Transform all numeric features with it

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
scaler = StandardScaler().fit(X=train[['Fare', 'Age', 'SibSp', 'Parch']])

In [16]:
train[['Fare', 'Age', 'SibSp', 'Parch']] = scaler.transform(train[['Fare', 'Age', 'SibSp', 'Parch']])
test[['Fare', 'Age', 'SibSp', 'Parch']] = scaler.transform(test[['Fare', 'Age', 'SibSp', 'Parch']])
val[['Fare', 'Age', 'SibSp', 'Parch']] = scaler.transform(val[['Fare', 'Age', 'SibSp', 'Parch']])

## Model training

### 5. Train a LinearRegression on the train data and evaluate its perfomance on train/val/test. Use accuracy as a metric

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error

In [18]:
def accuracy(model, split_name):
    return accuracy_score(data_splits[split_name][1], model.predict(data_splits[split_name][0]) > 0.5)
def mse(model, split_name):
    return mean_squared_error(data_splits[split_name][1], model.predict(data_splits[split_name][0]))

In [19]:
def test_metric(model, metric):
    for name, data in data_splits.items():
        print(f'{name} {metric.__name__} = {metric(model, name):.4f}')

In [None]:
model1 = LinearRegression().fit(train, train_y)

In [None]:
test_metric(model1, accuracy)

train accuracy = 0.8015
val accuracy = 0.8315
test accuracy = 0.7654


### 6. Train a RidgeRegression on the train data and find best alpha on the val data. Evaluate performance of the best model on the test data

In [None]:
from sklearn.linear_model import Ridge
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score

In [None]:
def auc(model, split_name):
    return roc_auc_score(data_splits[split_name][1], model.predict(data_splits[split_name][0]))

In [None]:
accs = []
accs_t = []
for alpha in tqdm(np.linspace(0.01, 100, 1000)):
    model_alpha = Ridge(alpha=alpha).fit(train, train_y)
    accs.append(auc(model_alpha, 'val'))
    accs_t.append(auc(model_alpha, 'test'))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
best_alpha = np.linspace(0.01, 100, 1000)[np.argmax(accs)]
model3 = Ridge(alpha=best_alpha).fit(train, train_y)
auc(model3, 'test')

0.7991436100131752

### 7. Now we will try cross-validation.

In [None]:
train2   = pd.concat((train, val))
train_y2 = pd.concat((train_y, val_y))

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import make_scorer

In [None]:
auc_scorer = make_scorer(roc_auc_score)

In [None]:
accs = []
for alpha in tqdm(np.linspace(0.01, 100, 1000)):
    model_cv = Ridge(alpha=alpha)
    preds = cross_val_score(model_cv, train2, train_y2, cv=10, scoring=auc_scorer)
    accs.append(preds.mean())

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
preds

In [None]:
cv_alpha = np.linspace(0.01, 100, 1000)[np.argmax(accs)]

In [None]:
model4 = Ridge(alpha=cv_alpha).fit(train, train_y)

In [None]:
auc(model4, 'test')

0.7945322793148881