## U.S. Patent Phrase to Phrase Matching
### Help Identify Similar Phrases in U.S. Patents
https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/overview

Match patent phrases given the context (patent category). Result scores should be one of:
- 1.0: Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”)
- 0.75: Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol"
- 0.5: Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches
- 0.25: Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms
- 0.0: Unrelated

In [1]:
import os

# Treated as "running on Kaggle" whenever KAGGLE_KERNEL_RUN_TYPE is set to a non-empty value.
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE')

# Models are written to the working directory in both environments.
MODEL_OUTPUT_DIR = '.'

# (input dir, data output dir): Kaggle layout first, local layout otherwise
INPUT_DIR, DATA_OUTPUT_DIR = (
    ('../input/us-patent-phrase-to-phrase-matching', '.')
    if IS_KAGGLE
    else ('data', 'data')
)

In [2]:
print(INPUT_DIR, DATA_OUTPUT_DIR, MODEL_OUTPUT_DIR)

../input/us-patent-phrase-to-phrase-matching . .


## Data Exploration

In [3]:
import pandas as pd

In [4]:
train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

In [5]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [6]:
print('training records: {}'.format(len(train_df)))

training records: 36473


In [7]:
test_df.head()

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04


In [8]:
print('test records: {}'.format(len(test_df)))

test records: 36


## Data Pre-Processing

In [9]:
import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
def get_wordnet_pos(treebank_tag):
    '''
    translate a Treebank-style POS tag (as produced by nltk.pos_tag) into the
    corresponding WordNet POS constant, based on the tag's first letter

    any tag that is not an adjective, verb, noun or adverb tag is treated as a noun
    '''
    prefix_to_wordnet_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # fallback for every other tag
    return wordnet.NOUN

In [12]:
# stop word lists already loaded from NLTK, keyed by language name
_STOPWORD_CACHE = {}


def _get_stopwords(language='english'):
    '''
    return the NLTK stop word list for `language` as a frozenset, loading it only once
    '''
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def pre_process(text):
    '''
    convert to lowercase, remove punctuation, stop words, perform lemmatization

    stop words are only dropped when at least one other word remains, so a phrase
    that consists solely of stop words is kept (minus punctuation)

    text: the raw phrase (str)
    returns: the lemmatized words joined by single spaces
    '''
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    # split() (rather than split(' ')) also copes with tabs and repeated spaces
    words = text.split()

    # the stop word list is fetched once and cached; asking NLTK for it again
    # for every single word made this function needlessly slow
    stop_words = _get_stopwords('english')
    words_without_stopwords = [word for word in words if word not in stop_words]
    if words_without_stopwords:  # at least one word remains after stop word removal
        words = words_without_stopwords

    # part of speech tagging prior to lemmatization improves results
    pos = nltk.pos_tag(words)  # list of (word, pos_tag) tuples
    # replace pos_tag with WordNet POS tags, needed by the WordNetLemmatizer
    pos = [(word, get_wordnet_pos(tag)) for word, tag in pos]

    lemmatized = ' '.join(lemmatizer.lemmatize(word, wordnet_pos) for word, wordnet_pos in pos)

    return lemmatized

In [13]:
# clean every anchor / target phrase with pre_process, keeping the original row order
train_anchors = list(map(pre_process, train_df.anchor))
train_targets = list(map(pre_process, train_df.target))
test_anchors = list(map(pre_process, test_df.anchor))
test_targets = list(map(pre_process, test_df.target))

In [14]:
# swap the cleaned phrases into copies of the raw frames (the raw frames stay untouched)
# and checkpoint both copies to disk
train_df_processed = train_df.assign(anchor=train_anchors, target=train_targets)
train_processed_path = os.path.join(DATA_OUTPUT_DIR, 'train_df_processed.csv')
train_df_processed.to_csv(train_processed_path, index=False)

test_df_processed = test_df.assign(anchor=test_anchors, target=test_targets)
test_processed_path = os.path.join(DATA_OUTPUT_DIR, 'test_df_processed.csv')
test_df_processed.to_csv(test_processed_path, index=False)

In [15]:
# train_anchors_original = list(train_df['anchor'])
# train_targets_original = list(train_df['target'])

# print('anchors')
# for i in range(len(train_anchors)):
#     if train_anchors_original[i] != train_anchors[i]:
#         print(train_anchors_original[i], ',', train_anchors[i])

# print()

# print('targets')
# for i in range(len(train_targets)):
#     if train_targets_original[i] != train_targets[i]:
#         print(train_targets_original[i], ',', train_targets[i])

## Get Patent Symbol Descriptions

Ran the below in BigQuery on Google Cloud:

```
SELECT symbol, titleFull FROM `patents-public-data.cpc.definition_202201`
```

The query result was exported to a BigQuery table in my own project, saved as a CSV file to Google Drive, and downloaded here as `data/patent_symbol_descriptions.csv`.

In [16]:
# the CSV is read from a different location on Kaggle than in a local checkout
symbol_descriptions_path = (
    '../input/d/datasets/vikrambajaj/us-patent-phrase-to-phrase-matching/patent_symbol_descriptions.csv'
    if IS_KAGGLE
    else os.path.join(INPUT_DIR, 'patent_symbol_descriptions.csv')
)
patent_symbol_descriptions = pd.read_csv(symbol_descriptions_path)

In [17]:
patent_symbol_descriptions.describe()

Unnamed: 0,symbol,titleFull
count,260690,260690
unique,260690,232207
top,A01B,Details
freq,1,283


In [18]:
patent_symbol_descriptions.head()

Unnamed: 0,symbol,titleFull
0,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...
1,A01C,PLANTING; SOWING; FERTILISING
2,A01D,HARVESTING; MOWING
3,A01F,PROCESSING OF HARVESTED PRODUCE; HAY OR STRAW ...
4,A01G,"HORTICULTURE; CULTIVATION OF VEGETABLES, FLOWE..."


In [19]:
# converting to map for easy replacement: symbol -> lower-cased full title.
# The map is built from whole columns at once; fetching every row twice with
# .iloc is very slow on a table of this size (260,690 rows).
symbol_description_map = dict(zip(
    patent_symbol_descriptions['symbol'],
    patent_symbol_descriptions['titleFull'].str.lower(),
))

In [20]:
import json

with open(os.path.join(DATA_OUTPUT_DIR, './patent_symbol_description_map.json'), 'w', encoding='utf-8') as f:
    json.dump(symbol_description_map, f, indent=4)

In [21]:
train_df_processed['context'] = [symbol_description_map[context] for context in list(train_df_processed['context'])]
test_df_processed['context'] = [symbol_description_map[context] for context in list(test_df_processed['context'])]

In [22]:
# preprocessing the context column in the same way as the phrases.
# Each distinct description is processed only once and then looked up per row.
unique_train_contexts = list(set(train_df_processed.context))
# must come from the TEST frame: building it from the train frame raises a KeyError
# in the lookup that follows as soon as the test set contains a context unseen in training
unique_test_contexts = list(set(test_df_processed.context))

processed_unique_train_contexts = {c: pre_process(c) for c in unique_train_contexts}
processed_unique_test_contexts = {c: pre_process(c) for c in unique_test_contexts}

In [23]:
train_df_processed.context = [processed_unique_train_contexts[c] for c in list(train_df_processed.context)]
test_df_processed.context = [processed_unique_test_contexts[c] for c in list(test_df_processed.context)]

In [24]:
train_df_processed.to_csv(os.path.join(DATA_OUTPUT_DIR, 'train_df_processed.csv'), index=False)
test_df_processed.to_csv(os.path.join(DATA_OUTPUT_DIR, 'test_df_processed.csv'), index=False)

In [25]:
train_df_processed.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement pollution,furniture domestic article appliance coffee mi...,0.5
1,7b9652b17b68b7a4,abatement,act abate,furniture domestic article appliance coffee mi...,0.75
2,36d72442aefd8232,abatement,active catalyst,furniture domestic article appliance coffee mi...,0.25
3,5296b0c19e1ce60e,abatement,eliminate process,furniture domestic article appliance coffee mi...,0.5
4,54c1e3b9184cb5b6,abatement,forest region,furniture domestic article appliance coffee mi...,0.0


In [26]:
test_df_processed.head()

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,optic
1,09e418c93a776564,adjust gas flow,alter gas flow,combustion apparatus combustion process
2,36baf228038e314b,low trunnion,low locating,vehicle general
3,1f37ead645e7f0c8,cap component,upper portion,treatment textile like launder flexible materi...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,electric communication technique


In [27]:
labels = list(set(train_df_processed.score))

In [28]:
labels

[0.5, 0.75, 1.0, 0.25, 0.0]

The labels take a fixed set of values, so they are discretized into classes. PyTorch's `CrossEntropyLoss` expects integer class indices starting from 0.

In [29]:
# similarity score -> class index, in increasing order of similarity
label_map = {score: class_idx for class_idx, score in enumerate((0.0, 0.25, 0.5, 0.75, 1.0))}

In [30]:
label_desc = {0: 'dissimilar', 1: 'somewhat-related', 2: 'hyponym-hypernym', 3: 'close-synonym', 4: 'very-close-exact-match'}

In [31]:
train_df_processed['score'] = [label_map[s] for s in list(train_df_processed.score)]

In [32]:
train_df_processed.to_csv(os.path.join(DATA_OUTPUT_DIR, 'train_df_processed.csv'), index=False)

In [33]:
# load csv if kernel restarts
train_df_processed = pd.read_csv(os.path.join(DATA_OUTPUT_DIR, 'train_df_processed.csv'))
test_df_processed = pd.read_csv(os.path.join(DATA_OUTPUT_DIR, 'test_df_processed.csv'))

In [34]:
# train-val split (80-20)
from sklearn.model_selection import train_test_split

# the first column is the row id and the last one the score; everything in between is a feature
train_columns = list(train_df_processed.columns)
X = train_df_processed[train_columns[1:-1]]
y = train_df_processed[train_columns[-1]]

# split into train and val (fixed seed keeps the split reproducible)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# the test frame carries no score column, so only the id is dropped
X_test = test_df_processed[list(test_df_processed.columns)[1:]]
y_test = []

In [35]:
X_train

Unnamed: 0,anchor,target,context
1715,antigen composition,antigen immunoassay,measure test
26508,pulp apparatus,image forming apparatus,treatment textile like launder flexible materi...
15967,hydrocarbyl substitute succinic,substitute succinic anhydride,petroleum gas coke industry technical gas cont...
17306,intermediate connection,intermediate eye color,print line machine typewriter stamp
1741,antigen composition,poetic composition,measure test
...,...,...,...
16850,inner peripheral,inner peripheral side,information storage
6265,committee,approval area,checkingdevices
11284,embed groove,elongate groove,combustion apparatus combustion process
860,aesthetic effect,aesthetic value,sport game amusement


In [36]:
X_val

Unnamed: 0,anchor,target,context
33511,transmit alarm,display indicator,signal
18670,lock formation,retain element,vehicle general
18049,lateral power,transducer,basic electric element
31660,spreader body,spreader,agriculture forestry animal husbandry hunt tra...
15573,high gradient magnetic separator,magnetic filtration,separation solid material use liquid use pneum...
...,...,...,...
5040,cervical support,gel pack,furniture domestic article appliance coffee mi...
33907,trommel screen,trommel screen,separation solid material use liquid use pneum...
9090,different conductivity,conductive,basic electronic circuitry
25999,prolog,slide window,basic electronic circuitry


In [37]:
X_test

Unnamed: 0,anchor,target,context
0,opc drum,inorganic photoconductor drum,optic
1,adjust gas flow,alter gas flow,combustion apparatus combustion process
2,low trunnion,low locating,vehicle general
3,cap component,upper portion,treatment textile like launder flexible materi...
4,neural stimulation,artificial neural network,electric communication technique
5,dry corn,dry corn starch,biochemistry beer spirit wine vinegar microbio...
6,tunnel capacitor,capacitor housing,information storage
7,angular contact bearing,contact therapy radiation,machine tool metalworking otherwise provide
8,produce liquid hydrocarbon,produce treat stream,petroleum gas coke industry technical gas cont...
9,diesel fuel tank,diesel fuel tank,combustion engine hotgas combustionproduct eng...


In [38]:
y_train

1715     2
26508    0
15967    1
17306    0
1741     0
        ..
16850    3
6265     0
11284    2
860      2
15795    1
Name: score, Length: 29178, dtype: int64

In [39]:
y_val

33511    0
18670    1
18049    1
31660    3
15573    2
        ..
5040     1
33907    4
9090     2
25999    0
22135    0
Name: score, Length: 7295, dtype: int64

In [40]:
y_test  # to be predicted for submission.csv

[]

## Model Training

### Extracting Embeddings and Creating Dataset
https://huggingface.co/AI-Growth-Lab/PatentSBERTa

In [41]:
import torch
import os

In [42]:
# use the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [43]:
if IS_KAGGLE:
    !cp -r ../input/d/datasets/vikrambajaj/us-patent-phrase-to-phrase-matching/sentence-transformers-2.2.0/ ./
    !pip install ./sentence-transformers-2.2.0/sentence-transformers-2.2.0/

Processing ./sentence-transformers-2.2.0/sentence-transformers-2.2.0
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.0-py3-none-any.whl size=120747 sha256=134c1f410e017b1ae563573ffb1fc9b10fb577c4727008180baadf8ba2a31340
  Stored in directory: /root/.cache/pip/wheels/6f/77/99/692662615373359c8bd6d65a64cb2afe2ae4fd598c57be6ebe
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.0
[0m

In [44]:
# Load the PatentSBERTa sentence encoder.
# NOTE: the name `model` is rebound to the classifier further down the notebook, so the
# embedding cells have to run before the classifier is created.
if not IS_KAGGLE:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')
else:
    # torch.load unpickles the whole model object; only load files from a trusted source
    model = torch.load('../input/d/datasets/vikrambajaj/us-patent-phrase-to-phrase-matching/PatentSBERTa.pt')
model.to(device)
model.eval()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [45]:
def get_df_embeddings(df):
    '''
    encode the anchor, target and context text columns of df with the global encoder `model`

    returns a new DataFrame (fresh 0..n-1 index) with the same three column names, where
    every cell holds that row's embedding as a plain Python list
    '''
    text_columns = ('anchor', 'target', 'context')
    return pd.DataFrame({
        column: model.encode(df[column].values).tolist()
        for column in text_columns
    })

In [46]:
# generate embeddings, takes a while
X_train = get_df_embeddings(X_train)
X_test = get_df_embeddings(X_test)
X_val = get_df_embeddings(X_val)

X_train.to_json(os.path.join(DATA_OUTPUT_DIR, 'X_train.json'))
X_test.to_json(os.path.join(DATA_OUTPUT_DIR, 'X_test.json'))
X_val.to_json(os.path.join(DATA_OUTPUT_DIR, 'X_val.json'))

Batches:   0%|          | 0/912 [00:00<?, ?it/s]

Batches:   0%|          | 0/912 [00:00<?, ?it/s]

Batches:   0%|          | 0/912 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/228 [00:00<?, ?it/s]

Batches:   0%|          | 0/228 [00:00<?, ?it/s]

Batches:   0%|          | 0/228 [00:00<?, ?it/s]

In [47]:
y_train = y_train.tolist()

In [48]:
y_val = y_val.tolist()

In [49]:
import numpy as np
# load json if kernel restarted
X_train = pd.read_json(os.path.join(DATA_OUTPUT_DIR, 'X_train.json'))
X_test = pd.read_json(os.path.join(DATA_OUTPUT_DIR, 'X_test.json'))
X_val = pd.read_json(os.path.join(DATA_OUTPUT_DIR, 'X_val.json'))


def _stack_embeddings(embeddings_df):
    '''
    build one feature matrix from a frame whose anchor / target / context cells each
    hold an embedding list: row i is anchor_i + target_i + context_i concatenated

    every column is turned into a 2-D array in one go and the three arrays are joined
    side by side, instead of fetching each row three times with .iloc
    '''
    return np.hstack([
        np.array(embeddings_df[column].tolist())
        for column in ('anchor', 'target', 'context')
    ])


X_train_features = _stack_embeddings(X_train)
X_val_features = _stack_embeddings(X_val)
X_test_features = _stack_embeddings(X_test)

In [50]:
X_train_features.shape

(29178, 2304)

## PCA for Dimensionality Reduction

In [51]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_features_scaled = scaler.fit_transform(X_train_features)
X_val_features_scaled = scaler.transform(X_val_features)
X_test_features_scaled = scaler.transform(X_test_features)

In [52]:
from sklearn.decomposition import PCA

# n_components='mle' asks scikit-learn to choose the number of components itself
# (the recorded run kept 1638 of the 2304 input dimensions)
pca = PCA(n_components='mle')
# fitted on the training split only; val and test are projected with the same components.
# NOTE: this rebinds X_*_features, so re-running the StandardScaler cell after this one
# would scale the PCA output instead of the raw concatenated embeddings.
X_train_features = pca.fit_transform(X_train_features_scaled)
X_val_features = pca.transform(X_val_features_scaled)
X_test_features = pca.transform(X_test_features_scaled)

In [53]:
print(X_train_features.shape, X_val_features.shape, X_test_features.shape)

(29178, 1638) (7295, 1638) (36, 1638)


## Traditional (Non-Deep) Models

### Random Forest

In [54]:
# from sklearn.ensemble import RandomForestClassifier
# from joblib import load, dump

In [55]:
# rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
# rf.fit(X_train_features, y_train)

In [56]:
# y_pred = rf.predict(X_test_features)

# submission_df = pd.DataFrame()
# submission_df['id'] = test_df['id']
# submission_df['score'] = y_pred

# submission_df.to_csv(os.path.join(DATA_OUTPUT_DIR, 'submission.csv'), index=False)

### XGBoost

In [57]:
# from xgboost.sklearn import XGBClassifier

In [58]:
# xgb = XGBClassifier(objective='multi:softmax', num_class=len(labels), seed=42)
# xgb.fit(X_train_features, y_train)

In [59]:
# def get_accuracy(actual, predicted):
#     match = 0
#     for i in range(len(actual)):
#         if actual[i] == predicted[i]:
#             match += 1
#     return round((match / len(actual)) * 100, 2)

In [60]:
# print('train set accuracy: {}'.format(get_accuracy(y_train, xgb.predict(X_train_features))))
# print('val set accuracy: {}'.format(get_accuracy(y_val, xgb.predict(X_val_features))))

In [61]:
# y_pred = xgb.predict(X_test_features)

# submission_df = pd.DataFrame()
# submission_df['id'] = test_df['id']
# submission_df['score'] = y_pred

# submission_df.to_csv(os.path.join(DATA_OUTPUT_DIR, 'submission.csv'), index=False)

## Training a Neural Network

In [62]:
from torch.utils.data import Dataset

In [63]:
class PatentsDataset(Dataset):
    '''
    pairs a feature array with its labels so the samples can be served by a DataLoader

    X: array that supports X[idx, :] indexing (one row of features per sample)
    y: sequence of integer-convertible class labels, aligned with the rows of X
    '''
    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # number of samples = number of feature rows
        return len(self.X)

    def __getitem__(self, idx):
        # returns (float32 feature vector, int64 scalar label) for sample idx
        features = torch.tensor(self.X[idx, :], dtype=torch.float32)
        label = torch.tensor(int(self.y[idx]), dtype=torch.long)
        return features, label

In [64]:
train_ds = PatentsDataset(X_train_features, y_train)
val_ds = PatentsDataset(X_val_features, y_val)

In [65]:
train_ds[0]

(tensor([-7.8414e-01,  9.1548e+00,  1.7544e+01,  ..., -3.9770e-09,
         -1.7226e-08,  5.5407e-09]),
 tensor(2))

In [66]:
train_ds[0][0].shape

torch.Size([1638])

In [67]:
train_ds[0][1].shape

torch.Size([])

### Initialize Model

In [68]:
EPOCHS = 300
BATCH_SIZE = 16
LEARNING_RATE = 0.001
NUM_FEATURES = train_ds[0][0].size()[0]
NUM_CLASSES = 5

In [69]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=1)

In [70]:
class MulticlassClassification(nn.Module):
    '''
    feed-forward classifier: num_feature -> 512 -> 128 -> 64 -> num_class

    every hidden layer is followed by batch norm and ReLU; dropout (p=0.5) is applied
    after the second and third hidden layers only. forward() returns one raw score
    per class (no softmax is applied inside the module).
    '''
    def __init__(self, num_feature, num_class):
        super().__init__()

        hidden_1, hidden_2, hidden_3 = 512, 128, 64

        # linear layers
        self.layer_1 = nn.Linear(num_feature, hidden_1)
        self.layer_2 = nn.Linear(hidden_1, hidden_2)
        self.layer_3 = nn.Linear(hidden_2, hidden_3)
        self.layer_out = nn.Linear(hidden_3, num_class)

        # activation / regularisation modules shared by the hidden layers
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

        # one batch norm per hidden layer
        self.batchnorm1 = nn.BatchNorm1d(hidden_1)
        self.batchnorm2 = nn.BatchNorm1d(hidden_2)
        self.batchnorm3 = nn.BatchNorm1d(hidden_3)

    def forward(self, x):
        # hidden layer 1: linear -> batch norm -> relu (no dropout)
        hidden = self.relu(self.batchnorm1(self.layer_1(x)))

        # hidden layers 2 and 3: linear -> batch norm -> relu -> dropout
        hidden = self.dropout(self.relu(self.batchnorm2(self.layer_2(hidden))))
        hidden = self.dropout(self.relu(self.batchnorm3(self.layer_3(hidden))))

        # raw class scores
        return self.layer_out(hidden)

In [71]:
def multi_class_acc(y_pred, y_test):
    '''
    percentage of samples whose highest-scoring class equals the label,
    rounded to a whole number

    y_pred: model outputs, one row of class scores per sample
    y_test: integer class labels, one per sample
    returns: 0-dim tensor holding the rounded percentage
    '''
    log_probs = torch.log_softmax(y_pred, dim=1)
    predicted_classes = log_probs.argmax(dim=1)

    hits = (predicted_classes == y_test).float()
    return torch.round(hits.sum() / len(hits) * 100)

In [72]:
# per-epoch history of accuracy and loss, one list per data split
accuracy_stats = {split: [] for split in ('train', 'val')}
loss_stats = {split: [] for split in ('train', 'val')}

In [73]:
model = MulticlassClassification(num_feature=NUM_FEATURES, num_class=NUM_CLASSES)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(model)

MulticlassClassification(
  (layer_1): Linear(in_features=1638, out_features=512, bias=True)
  (layer_2): Linear(in_features=512, out_features=128, bias=True)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=5, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


### Train Model

In [74]:
from tqdm import tqdm

# Train the classifier for at most EPOCHS epochs. After every epoch the whole validation
# set is scored; the model is saved whenever the validation loss reaches a new minimum,
# and training stops early once it has failed to improve for
# `early_stopping_patience` epochs in a row.
print("begin training ...")
lowest_val_loss = float('inf')  # best (lowest) mean validation loss seen so far
trigger_count = 0  # number of consecutive epochs without a new best validation loss
early_stopping_patience = 5  # stop training if val loss stops decreasing for these many epochs in a row

for e in tqdm(range(1, EPOCHS+1)):
    # TRAINING
    # running sums over the batches of this epoch; divided by the batch count below
    train_epoch_loss = 0
    train_epoch_acc = 0
    
    model.train()  # training mode for the dropout and batch norm layers
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()  # clear the gradients left over from the previous step
        
        y_train_pred = model(X_train_batch)
        
        train_loss = criterion(y_train_pred, y_train_batch)
        # multi_class_acc returns a percentage that is already rounded per batch
        train_acc = multi_class_acc(y_train_pred, y_train_batch)
        
        train_loss.backward()
        optimizer.step()
        
        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()
        
    # VALIDATION    
    with torch.no_grad():  # no gradients are needed while evaluating
        val_epoch_loss = 0
        val_epoch_acc = 0
        
        model.eval()  # evaluation mode for the dropout and batch norm layers
        # val_loader is built with batch_size=1, so each "batch" below is a single sample
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            
            y_val_pred = model(X_val_batch)
                        
            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_class_acc(y_val_pred, y_val_batch)
            
            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()

    # epoch statistics are unweighted means over the batches of each loader
    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(val_loader))
    accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
    accuracy_stats['val'].append(val_epoch_acc/len(val_loader))
    
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')
    
    # early stopping code
    current_val_loss = val_epoch_loss/len(val_loader)
    if current_val_loss < lowest_val_loss:
        # new best epoch: reset the patience counter and checkpoint the model
        trigger_count = 0
        lowest_val_loss = current_val_loss
        # overwrite model (the whole module object is saved, not just its state_dict)
        torch.save(model, os.path.join(MODEL_OUTPUT_DIR, 'model.pt'))
    else:
        trigger_count += 1
        if trigger_count == early_stopping_patience:
            # stop training
            print('Early stopping!')
            break

begin training ...


  0%|          | 1/300 [00:11<58:12, 11.68s/it]

Epoch 001: | Train Loss: 1.34159 | Val Loss: 1.22915 | Train Acc: 42.030| Val Acc: 47.526


  1%|          | 2/300 [00:23<58:24, 11.76s/it]

Epoch 002: | Train Loss: 1.21382 | Val Loss: 1.15704 | Train Acc: 49.315| Val Acc: 51.241


  1%|          | 3/300 [00:35<58:59, 11.92s/it]

Epoch 003: | Train Loss: 1.11170 | Val Loss: 1.07525 | Train Acc: 54.405| Val Acc: 54.818


  1%|▏         | 4/300 [00:47<58:08, 11.78s/it]

Epoch 004: | Train Loss: 0.99511 | Val Loss: 1.01413 | Train Acc: 59.321| Val Acc: 57.711


  2%|▏         | 5/300 [00:58<57:42, 11.74s/it]

Epoch 005: | Train Loss: 0.89508 | Val Loss: 0.98420 | Train Acc: 63.991| Val Acc: 59.397


  2%|▏         | 6/300 [01:10<57:49, 11.80s/it]

Epoch 006: | Train Loss: 0.79861 | Val Loss: 0.97190 | Train Acc: 68.394| Val Acc: 60.548


  2%|▏         | 7/300 [01:22<57:09, 11.71s/it]

Epoch 007: | Train Loss: 0.71755 | Val Loss: 0.96792 | Train Acc: 72.103| Val Acc: 62.481


  3%|▎         | 8/300 [01:34<57:10, 11.75s/it]

Epoch 008: | Train Loss: 0.63999 | Val Loss: 1.01688 | Train Acc: 75.217| Val Acc: 62.303


  3%|▎         | 9/300 [01:45<56:30, 11.65s/it]

Epoch 009: | Train Loss: 0.58313 | Val Loss: 1.07669 | Train Acc: 78.018| Val Acc: 61.947


  3%|▎         | 10/300 [01:57<56:03, 11.60s/it]

Epoch 010: | Train Loss: 0.54518 | Val Loss: 1.06625 | Train Acc: 79.733| Val Acc: 62.111


  4%|▎         | 11/300 [02:08<56:19, 11.69s/it]

Epoch 011: | Train Loss: 0.49636 | Val Loss: 1.08844 | Train Acc: 81.599| Val Acc: 62.934


  4%|▎         | 11/300 [02:20<1:01:36, 12.79s/it]

Epoch 012: | Train Loss: 0.45516 | Val Loss: 1.14569 | Train Acc: 83.518| Val Acc: 62.426
Early stopping!





### Make Predictions on Test Data

In [75]:
X_test_features

array([[ 3.53872686e+00,  7.80702550e-01, -2.15776971e+00, ...,
         7.89009261e-08, -6.25709495e-08, -6.05090177e-08],
       [ 5.48576727e-01, -6.49467174e-01, -4.49106462e+00, ...,
         1.28850419e-06, -1.92967033e-06,  1.22951121e-06],
       [-2.02128934e+00,  3.47129296e+00, -4.01482024e+00, ...,
        -1.04833453e-06, -1.59681268e-07,  3.87858167e-08],
       ...,
       [ 2.88613829e+00,  1.63427611e+01,  8.92587167e+00, ...,
        -2.58923327e-07,  1.68084077e-07,  2.45080465e-06],
       [ 1.38133575e+01, -5.37211658e+00, -2.36574985e-01, ...,
         2.74567795e-06, -1.28885734e-07,  2.78832407e-06],
       [-5.17718101e+00, -3.65483383e+00, -9.90775469e+00, ...,
         1.77134871e-06, -5.93156708e-07,  8.16428843e-07]])

In [76]:
model = torch.load(os.path.join(MODEL_OUTPUT_DIR, 'model.pt'))
model.eval()

MulticlassClassification(
  (layer_1): Linear(in_features=1638, out_features=512, bias=True)
  (layer_2): Linear(in_features=512, out_features=128, bias=True)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=5, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [77]:
# Score the whole test set in a single forward pass.
# - torch.no_grad(): no gradients are needed for inference.
# - one tensor built from the 2-D array replaces torch.tensor([row]) per sample, which
#   created a tensor from a list of ndarrays (slow, and the source of the warning below).
model.eval()  # dropout off, batch norm uses its running statistics
with torch.no_grad():
    test_tensor = torch.tensor(X_test_features, dtype=torch.float32).to(device)
    test_logits = model(test_tensor)
    # softmax does not change which class scores highest, so arg-max the raw outputs
    test_predictions = test_logits.argmax(dim=1).tolist()

  This is separate from the ipykernel package so we can avoid doing imports until


In [78]:
# The network predicts class indices (0-4), but the competition scores are
# 0.0 / 0.25 / 0.5 / 0.75 / 1.0, so label_map is inverted to turn each predicted
# class back into its similarity score before the file is written.
class_to_score = {class_idx: score for score, class_idx in label_map.items()}

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'score': [class_to_score[int(predicted_class)] for predicted_class in test_predictions],
})

submission_df.to_csv(os.path.join(DATA_OUTPUT_DIR, 'submission.csv'), index=False)