In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e16/sample_submission.csv
/kaggle/input/playground-series-s3e16/train.csv
/kaggle/input/playground-series-s3e16/test.csv
/kaggle/input/crab-ckpt-47epoch/model_0_47


**Configuration**

In [38]:
learning_rate = 0.0001
n_epoch = 30000
batch_size = 100
valid_ratio = 0.8


In [39]:
# load csv as dataframe
train_data=pd.read_csv('/kaggle/input/playground-series-s3e16/train.csv')
print("train data")
print(train_data.tail())
test_data=pd.read_csv('/kaggle/input/playground-series-s3e16/test.csv')
print("test data")
print(test_data.head())

train data
          id Sex  Length  Diameter  Height     Weight  Shucked Weight  \
74046  74046   F  1.6625    1.2625  0.4375  50.660556       20.680960   
74047  74047   I  1.0750    0.8625  0.2750  10.446791        4.323299   
74048  74048   F  1.4875    1.2000  0.4125  29.483480       12.303683   
74049  74049   I  1.2125    0.9625  0.3125  16.768729        8.972617   
74050  74050   I  0.9125    0.6750  0.2000   5.386405        2.055339   

       Viscera Weight  Shell Weight  Age  
74046       10.361742     12.332033   10  
74047        2.296310      3.543687    6  
74048        7.540967      8.079607   10  
74049        2.919999      4.280774    8  
74050        1.034757      1.700970    6  
test data
      id Sex  Length  Diameter  Height     Weight  Shucked Weight  \
0  74051   I  1.0500    0.7625  0.2750   8.618248        3.657085   
1  74052   I  1.1625    0.8875  0.2750  15.507176        7.030676   
2  74053   F  1.2875    0.9875  0.3250  14.571643        5.556502   
3  740

In [40]:
# analyze the correlation
print(train_data.corr(numeric_only=True))
# feature extraction
feature_list = ["Length","Diameter","Height","Weight","Shucked Weight","Viscera Weight","Shell Weight"]

                      id    Length  Diameter    Height    Weight  \
id              1.000000  0.000165  0.000290  0.000967 -0.000910   
Length          0.000165  1.000000  0.989437  0.918352  0.936374   
Diameter        0.000290  0.989437  1.000000  0.921353  0.938249   
Height          0.000967  0.918352  0.921353  1.000000  0.901775   
Weight         -0.000910  0.936374  0.938249  0.901775  1.000000   
Shucked Weight -0.000801  0.915516  0.914199  0.864083  0.971267   
Viscera Weight -0.000640  0.917855  0.918351  0.883127  0.971062   
Shell Weight   -0.000816  0.916957  0.922688  0.903398  0.965525   
Age             0.000089  0.612843  0.621256  0.638067  0.601195   

                Shucked Weight  Viscera Weight  Shell Weight       Age  
id                   -0.000801       -0.000640     -0.000816  0.000089  
Length                0.915516        0.917855      0.916957  0.612843  
Diameter              0.914199        0.918351      0.922688  0.621256  
Height                0.864

**custom dataset**

In [41]:
from torch.utils.data import Dataset, DataLoader

class CrabAgeDataset(Dataset):
    def __init__(self, data):
        """
        data: dataframe
        """
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        """
        return the row as a dictionary 
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        #idx=str(idx)
        sample=self.data.iloc[idx]
        return sample.to_dict()

#randomly split train data into training set and validation set
valid_data=train_data.sample(frac=valid_ratio)
train_data=train_data.drop(valid_data.index)
#load dataframe into dataset
train_data=CrabAgeDataset(train_data)
valid_data=CrabAgeDataset(valid_data)
#test_data=CrabAgeDataset(test_data)

**Model**

In [42]:
import torch.nn as nn
class LinearRegression(nn.Module):
    def __init__(self, input_dim=7, output_dim=1):
        super(LinearRegression, self).__init__()
        self.layers=nn.Sequential(
            nn.Linear(input_dim,64),
            nn.ReLU(),
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32,16),
            nn.ReLU(),
            nn.Linear(16,1)
        )
    def forward(self,x):
        return self.layers(x)
    
model=LinearRegression()

**loss function and optimizer**

In [43]:
import torch
loss_fn = nn.L1Loss()#MAE
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

**Training**

In [44]:
train_dataloader = DataLoader(train_data, batch_size=batch_size,shuffle=True, num_workers=0)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size,shuffle=True, num_workers=0)
#test_dataloader = DataLoader(test_data, batch_size=batch_size,shuffle=True, num_workers=0)


In [45]:
print(type(train_data))

<class '__main__.CrabAgeDataset'>


In [46]:
def train_one_epoch(epoch_index):
    running_loss=0
    last_loss=0
    
    for i, sample in enumerate(train_dataloader):
               
        # Zero your gradients for every batch!
        optimizer.zero_grad()
    
        #get inputs and labelss
        inputs=[]
        labels=sample['Age']
        batch_num=len(labels)
        labels=torch.tensor(labels,dtype=torch.float32)
        #labels=torch.transpose(labels,dim0=0,dim)
        #print(labels.shape)
        labels=labels.view(batch_num,1)
        #feature extraction
        for feature in feature_list:
            inputs.append(torch.tensor(sample[feature],dtype=torch.float32))
        #inputs=torch.FloatTensor(inputs)
    

        
        # Make predictions for this batch
        #print(inputs)
        inputs=torch.stack(inputs,dim=1)
        
        inputs.to(torch.device("cuda"))
        labels.to(torch.device("cuda"))
        if i==1:
            pass
            #print(batch_num)
            #print(inputs.shape)
            #print(labels.shape)
        #inputs=torch.transpose(inputs,dim0=0,dim1=1)
        #print(inputs.shape)
        #print(labels)
        #print(inputs)
        outputs = model(inputs)


        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss
    

In [47]:
#import datetime
# Initializing in a separate cell so we can easily add more epochs to the same run
#timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
#writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number)


    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, vsample in enumerate(valid_dataloader):
            #get inputs and labelss
            vinputs=[]
            vlabels=vsample['Age']
            batch_num=len(vlabels)
            vlabels=torch.tensor(vlabels,dtype=torch.float32)
            #labels=torch.transpose(labels,dim0=0,dim)
            #print(labels.shape)
            vlabels=vlabels.view(batch_num,1)
            #feature extraction
            for feature in feature_list:
                vinputs.append(torch.tensor(vsample[feature],dtype=torch.float32))
            #inputs=torch.FloatTensor(inputs)
            vinputs=torch.stack(vinputs,dim=1)    
            voutputs = model(vinputs)
            vloss = loss_fn(voutputs, vlabels)
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    """
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()
    """

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(0, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:


  labels=torch.tensor(labels,dtype=torch.float32)
  inputs.append(torch.tensor(sample[feature],dtype=torch.float32))
  vlabels=torch.tensor(vlabels,dtype=torch.float32)
  vinputs.append(torch.tensor(vsample[feature],dtype=torch.float32))


LOSS train 0 valid 5.507346153259277
EPOCH 2:
LOSS train 0 valid 3.128234624862671
EPOCH 3:
LOSS train 0 valid 3.0290915966033936
EPOCH 4:
LOSS train 0 valid 2.918623447418213
EPOCH 5:
LOSS train 0 valid 2.7789225578308105


**inference**

In [48]:
our_model=LinearRegression(7,1)
our_model.load_state_dict(torch.load("/kaggle/input/crab-ckpt-47epoch/model_0_47"))
#print(test_data.head())
test_inputs=[]
for i in test_data.index:
    features=[]
    for f in feature_list:
        features.append(test_data[f][i])
    test_inputs.append(torch.tensor(features,dtype=torch.float32))
test_inputs=torch.stack(test_inputs,dim=1)
test_inputs=torch.transpose(test_inputs,0,1)
print(test_inputs.shape)
test_results=our_model(test_inputs)
print(test_results.shape)
print(test_results)
#for i, sample in enumerate(test_data):
    #test_inputs=[]
    #for feature in feature_list:
    #inputs.append(torch.tensor(sample[feature],dtype=torch.float32))
        #inputs=torch.FloatTensor(inputs)
    

        
        # Make predictions for this batch
        #print(inputs)
        #inputs=torch.stack(inputs,dim=1)

KeyboardInterrupt: 

In [None]:
path = '/kaggle/working/output.txt'
f = open(path, 'w')
print('id,yield', file=f)
for i in test_data.index:
    print("{},{}".format(i,test_results[i].item()),file=f)
    #print(i,1,",",test_results[i].item(), file=f)
print('etc.', file=f)
f.close()

In [50]:
yield_list=[]
for i in test_data.index:
    yield_list.append(test_results[i].item())
result={
    "id":test_data["id"],
    "yield":yield_list
}
result=pd.DataFrame(result)
print(result)
result.to_csv("/kaggle/working/result.csv")
#test_data.to_csv("/kaggle/working/test.csv")

           id      yield
0       74051   7.543072
1       74052   7.853662
2       74053   9.673781
3       74054   9.180499
4       74055   7.557554
...       ...        ...
49363  123414   8.671449
49364  123415   7.805567
49365  123416  11.743745
49366  123417   9.339592
49367  123418  12.010289

[49368 rows x 2 columns]
