In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn 
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# modules here are updated everytime before running any cell
%load_ext autoreload
%autoreload 1
%aimport pre_processing_funcs
%aimport class_object
%aimport training
%aimport tuning

### Reading the dataset files and extracting useful info 

In [3]:
pd.set_option('display.max_columns', None)

train = pd.read_csv('train_processed.csv')   
author_features = pd.read_csv('author_features.csv')
mean_features = pd.read_csv('train_mean_features_by_author.csv')

In [4]:
author_features.rename(columns={ author_features.columns[0]: "author" }, inplace = True)
# extracting longest sentence lenghth, this zill be useful for padding
max_len = np.max(author_features['max sentence lenghth'].values)

author_all_features  = pd.concat([author_features,mean_features], axis = 1)

# remove the columns that are redundant features, as well as columns that had little 
# importance according to the random forrest results
author_all_features.drop(['mean word lenghth','max word lenghth', 'min word lenghth','max sentence lenghth',
                          'min sentence lenghth','max word lenghth','min word lenghth','semicolon freq',
                          'n_punct','punct_prop', 'semi_colon', 'coma', 'interrogation',
                          'n_stop_words', 'stop_words_prop','n_syllabe', 'DT','author',
                          'NNS', 'IN', 'VBG', 'PRP$', 'MD','VB','CC','n_words',
                          'mean syllables per word' ], axis=1, inplace = True)


author_all_features


Unnamed: 0,mean sentence lenghth,syllables freq,commas freq,stop words freq,n_character,NN,RB,VBD,PRP,label
0,29.381266,5.143682,0.303245,1.804133,118.161646,5.042532,1.624684,1.328354,1.051013,2.0
1,30.873292,5.214659,0.197323,1.733783,129.134871,5.439574,1.632831,2.034073,1.074712,1.0
2,31.241893,5.030632,0.255164,1.841542,125.485606,5.264725,1.33587,1.920582,1.538882,0.0


In [25]:
# we create 3 added feature vectors each associated to an author, and we allocate them to the corresponding training data
feat_vect_2 = author_all_features.to_numpy()[0][:-1]
feat_vect_1 = author_all_features.to_numpy()[1][:-1]
feat_vect_0 = author_all_features.to_numpy()[2][:-1]

label_dict={'EAP':2, 'HPL':1, 'MWS':0}
nr_classes = 3

train_feat_vectors = []
for label in train.label.values:
    if label == 0:
        train_feat_vectors.append(feat_vect_0)
    elif label == 1:
        train_feat_vectors.append(feat_vect_1)
    elif label == 2:
        train_feat_vectors.append(feat_vect_2)

print(len(train_feat_vectors))

19579


In [6]:
test = pd.read_csv('test_features.csv') 
test_ids = test.id.values
test_tokenized_sents = test.tokenized_sents.values
test.drop(['id','text', 'text_low','tokenized_sents', 'semicolon freq',
            'text_all','n_punct','punct_prop', 'without_punct',
            'semi_colon','coma', 'interrogation', 'n_stop_words', 'stop_words_prop',
            'n_syllabe', 'IN','PRP$', 'VBG','JJ',
            'CC' ], axis=1, inplace = True)

# rearranging columns to match train feature vectors 
test_cols = ['n_words', 'syllables freq', 'commas freq' , 'stop words freq', 'n_character',
            'NN','RB','VBD','PRP']
test = test[test_cols]

test.head()


Unnamed: 0,n_words,syllables freq,commas freq,stop words freq,n_character,NN,RB,VBD,PRP
0,22,5.2,0.4,1.8,92,5.0,1.0,2.0,1.0
1,69,5.0,0.352941,2.0,269,16.0,4.0,5.0,1.0
2,36,5.333333,0.111111,1.777778,157,5.0,2.0,3.0,2.0
3,46,4.727273,0.363636,1.909091,183,8.0,2.0,3.0,1.0
4,12,4.333333,0.0,2.0,43,2.0,1.0,0.0,0.0


In [23]:
# we create added feature vectors for each test line
test_feat_vectors = np.empty([len(test), 9])
test_numpy = test.to_numpy()
for i in test.index:
    test_feat_vectors[i] = test_numpy[i]
test_feat_vectors.shape


(8392, 9)

In [8]:
#we embed all test sentences with word2vec
w2v_embeddings, vocab_size= pre_processing_funcs.word2vec(test_tokenized_sents)

In [9]:
#pad all embeddings to max len
padded_test_embeddings = pre_processing_funcs.pad_to_max(w2v_embeddings, max_len)

In [10]:
#same with train data, embed then pad
train_embeddings, vocab_size= pre_processing_funcs.word2vec(train['tokenized_sents'])
padded_train_embeddings = pre_processing_funcs.pad_to_max(train_embeddings, max_len)

In [11]:
print(padded_train_embeddings.shape)
print(padded_test_embeddings.shape)

torch.Size([19579, 876, 200])
torch.Size([8392, 876, 200])


In [12]:
train_labels = train.label.values
print(train_labels.shape)

(19579,)


### loading the tensor datasets

In [26]:
X_train, X_valid, y_train, y_valid, feat_train, feat_valid = train_test_split(padded_train_embeddings, train_labels, train_feat_vectors, test_size=0.25, random_state=42)

# create tensor datasets
train_data = TensorDataset(X_train, torch.LongTensor(y_train) )   
valid_data = TensorDataset(X_valid, torch.LongTensor(y_valid) )


RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3430416000 bytes.

### making the tuning dictionary that holds individual values to be tested

In [16]:
simple_tune_dict = {}
simple_tune_dict["lr_values"] = [0.001]#,0.0001],0.01]
simple_tune_dict["hidden_dim_values"] = [9,24] #this value nacessary for including added features at the dense layer
simple_tune_dict["n_layers"] = [1,2]
simple_tune_dict["cells"] = ['lstm']#,'ellman']#,'gru']
simple_tune_dict["bidirectional"] = [False,True]
simple_tune_dict["batch_size"] = [32,64]
simple_tune_dict["patience"] = [3] #early stopping metric
print(simple_tune_dict)


{'lr_values': [0.001], 'hidden_dim_values': [9, 24], 'n_layers': [1, 2], 'cells': ['lstm'], 'bidirectional': [False, True], 'batch_size': [32, 64], 'patience': [3]}


### model creation

In [28]:
if __name__ == '__main__':
    
    model = tuning.tune_model_grid(simple_tune_dict , max_len,
                            train_data , valid_data , 
                            task = "classif", nr_classes = nr_classes)


0.001 9 1 lstm False
self outputs  3
--------------------------------


KeyboardInterrupt: 