In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/mini-task/

/content/drive/MyDrive/mini-task


In [3]:
import numpy as np
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from tensorflow import keras

from Datasets import SequenceDataset
from models import SequenceModel
from utility import top_k_metric, calculating_class_weights, get_weighted_loss

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
train_ds = SequenceDataset(mode="train", tag_func=nltk.pos_tag_sents)
val_ds = SequenceDataset(mode="valid", tag_func=nltk.pos_tag_sents, input_tokenizer=train_ds.input_tokenizer, target_tokenizer=train_ds.target_tokenizer)
test_ds = SequenceDataset(mode="test", tag_func=nltk.pos_tag_sents, input_tokenizer=train_ds.input_tokenizer, target_tokenizer=train_ds.target_tokenizer)

# TFIDF First

In [5]:
train_data, train_labels = train_ds.get_data_target(whole_dialog=False, data_type="tfidf")
val_data, val_labels = val_ds.get_data_target(whole_dialog=False, data_type="tfidf")
test_data, test_labels = test_ds.get_data_target(whole_dialog=False, data_type="tfidf")

finished loading corpus
finished loading descriptions
finished extracting contexts
finished extracting targets (OOCs)
finished loading corpus
finished loading descriptions
finished extracting contexts
finished extracting targets (OOCs)
finished loading corpus
finished loading descriptions
finished extracting contexts
finished extracting targets (OOCs)


## Model

In [6]:
class_weights = calculating_class_weights(train_labels)

In [7]:
model = keras.Sequential([
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.Dense(train_labels.shape[-1], activation="sigmoid"),
])

model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-5),
              loss=get_weighted_loss(class_weights),
              metrics=[
                  keras.metrics.Precision(name="precision", top_k=5),
                  keras.metrics.Recall(name="recall", top_k=5),
              ])

In [8]:
model.fit(train_data, train_labels, validation_data=(val_data, val_labels), batch_size=32, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f006b044588>

## Evaluation

In [9]:
train_pred = model.predict(train_data)
val_pred = model.predict(val_data)
test_pred = model.predict(test_data)

In [10]:
for i in range(1, 11):
    metric = top_k_metric(train_pred, train_labels, i)
    print(f"top {i}: \nprecision: {metric[0]}\nrecall: {metric[1]}\nf1: {metric[2]}")

top 1: 
precision: 0.5394797956339991
recall: 0.2478263188776871
f1: 0.33963229650206517
top 2: 
precision: 0.41593125870877845
recall: 0.38214114258281323
f1: 0.39832086958939145
top 3: 
precision: 0.334881560613098
recall: 0.4615138422147544
f1: 0.3881300047103156
top 4: 
precision: 0.2810322805387831
recall: 0.5164026244199071
f1: 0.36398157721590374
top 5: 
precision: 0.2423130515559684
recall: 0.5565690510481677
f1: 0.3376316598443542
top 6: 
precision: 0.21377148165350673
recall: 0.5892142742838854
f1: 0.3137221488518724
top 7: 
precision: 0.1919083007099728
recall: 0.6171120712647357
f1: 0.29277119105161264
top 8: 
precision: 0.174480376219229
recall: 0.6412225956152985
f1: 0.27431740127562954
top 9: 
precision: 0.16012540640966094
recall: 0.6620259241478637
f1: 0.257877512856475
top 10: 
precision: 0.14779377612633535
recall: 0.6789352963140769
f1: 0.2427455729638494


In [11]:
for i in range(1, 11):
    metric = top_k_metric(val_pred, val_labels, i)
    print(f"top {i}: \nprecision: {metric[0]}\nrecall: {metric[1]}\nf1: {metric[2]}")

top 1: 
precision: 0.160075329566855
recall: 0.07734303912647862
f1: 0.10429447852760737
top 2: 
precision: 0.1224105461393597
recall: 0.11828935395814377
f1: 0.1203146691346599
top 3: 
precision: 0.1016949152542373
recall: 0.14740673339399454
f1: 0.12035661218424963
top 4: 
precision: 0.08945386064030132
recall: 0.17288444040036396
f1: 0.1179025752404592
top 5: 
precision: 0.07984934086629002
recall: 0.19290263876251137
f1: 0.11294619072988812
top 6: 
precision: 0.07250470809792843
recall: 0.21019108280254778
f1: 0.10781796966161027
top 7: 
precision: 0.06725854183481302
recall: 0.22747952684258416
f1: 0.10382059800664452
top 8: 
precision: 0.06320621468926553
recall: 0.2443130118289354
f1: 0.10043014774639984
top 9: 
precision: 0.05890353630466625
recall: 0.25614194722474976
f1: 0.0957808778496087
top 10: 
precision: 0.05696798493408663
recall: 0.2752502274795268
f1: 0.09439850210641285


In [12]:
for i in range(1, 11):
    metric = top_k_metric(test_pred, test_labels, i)
    print(f"top {i}: \nprecision: {metric[0]}\nrecall: {metric[1]}\nf1: {metric[2]}")

top 1: 
precision: 0.1440677966101695
recall: 0.06737120211360634
f1: 0.09180918091809182
top 2: 
precision: 0.1224105461393597
recall: 0.11448701012769705
f1: 0.11831626848691694
top 3: 
precision: 0.10326428123038292
recall: 0.1448701012769705
f1: 0.12057907275059555
top 4: 
precision: 0.09157250470809793
recall: 0.1712901805372083
f1: 0.11934345758551926
top 5: 
precision: 0.0839924670433145
recall: 0.19638925583443417
f1: 0.11766257749637253
top 6: 
precision: 0.07689893283113622
recall: 0.21576398062527521
f1: 0.1133865555941224
top 7: 
precision: 0.07021791767554479
recall: 0.22985468956406868
f1: 0.10757341576506954
top 8: 
precision: 0.06450094161958568
recall: 0.2413033905768384
f1: 0.10179251416364818
top 9: 
precision: 0.06172839506172839
recall: 0.25979744605900484
f1: 0.09975483980049032
top 10: 
precision: 0.05847457627118644
recall: 0.27344782034346105
f1: 0.09634628810798232


# BOW Now

In [13]:
train_data, train_labels = train_ds.get_data_target(whole_dialog=False, data_type="binary")
val_data, val_labels = val_ds.get_data_target(whole_dialog=False, data_type="binary")
test_data, test_labels = test_ds.get_data_target(whole_dialog=False, data_type="binary")

class_weights = calculating_class_weights(train_labels)

model = keras.Sequential([
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.Dense(train_labels.shape[-1], activation="sigmoid"),
])

model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-5),
              loss=get_weighted_loss(class_weights),
              metrics=[
                  keras.metrics.Precision(name="precision", top_k=5),
                  keras.metrics.Recall(name="recall", top_k=5),
              ])

model.fit(train_data, train_labels, validation_data=(val_data, val_labels), batch_size=32, epochs=50)

finished loading corpus
finished loading descriptions
finished extracting contexts
finished extracting targets (OOCs)
finished loading corpus
finished loading descriptions
finished extracting contexts
finished extracting targets (OOCs)
finished loading corpus
finished loading descriptions
finished extracting contexts
finished extracting targets (OOCs)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f001536bb00>

In [14]:
train_pred = model.predict(train_data)
val_pred = model.predict(val_data)
test_pred = model.predict(test_data)

In [15]:
for i in range(1, 11):
    metric = top_k_metric(train_pred, train_labels, i)
    print(f"top {i}: \nprecision: {metric[0]}\nrecall: {metric[1]}\nf1: {metric[2]}")

top 1: 
precision: 0.22886669763121226
recall: 0.10513682189150264
f1: 0.1440842136042984
top 2: 
precision: 0.1816651184393869
recall: 0.16690670507281166
f1: 0.17397347863556753
top 3: 
precision: 0.15640966093822573
recall: 0.2155544887181949
f1: 0.18127986003633673
top 4: 
precision: 0.13710520204366
recall: 0.2519336427161679
f1: 0.1775730801767083
top 5: 
precision: 0.12296795169530887
recall: 0.2824451912305969
f1: 0.17133981587846037
top 6: 
precision: 0.11154977550704444
recall: 0.30746252733770735
f1: 0.16370581803206524
top 7: 
precision: 0.10349346426912613
recall: 0.3327999146530112
f1: 0.1578874112689957
top 8: 
precision: 0.09666744078030655
recall: 0.3552568410945751
f1: 0.1519801923713246
top 9: 
precision: 0.09048098260824688
recall: 0.3740865205099483
f1: 0.1457171056049036
top 10: 
precision: 0.08553181607059916
recall: 0.3929161999253214
f1: 0.1404827066665395


In [16]:
for i in range(1, 11):
    metric = top_k_metric(val_pred, val_labels, i)
    print(f"top {i}: \nprecision: {metric[0]}\nrecall: {metric[1]}\nf1: {metric[2]}")

top 1: 
precision: 0.12994350282485875
recall: 0.06278434940855324
f1: 0.08466257668711656
top 2: 
precision: 0.1007532956685499
recall: 0.09736123748862602
f1: 0.09902822767237389
top 3: 
precision: 0.08788449466415568
recall: 0.12738853503184713
f1: 0.10401188707280833
top 4: 
precision: 0.0765065913370998
recall: 0.14786169244767972
f1: 0.10083772882407695
top 5: 
precision: 0.067984934086629
recall: 0.16424021838034578
f1: 0.09616409163558871
top 6: 
precision: 0.061833019460138104
recall: 0.17925386715195632
f1: 0.09194865810968494
top 7: 
precision: 0.05743879472693032
recall: 0.1942675159235669
f1: 0.08866279069767444
top 8: 
precision: 0.052848399246704335
recall: 0.20427661510464057
f1: 0.08397232092762298
top 9: 
precision: 0.04938271604938271
recall: 0.21474067333939945
f1: 0.08029942157196325
top 10: 
precision: 0.047080979284369114
recall: 0.22747952684258416
f1: 0.07801529099703543


In [17]:
for i in range(1, 11):
    metric = top_k_metric(test_pred, test_labels, i)
    print(f"top {i}: \nprecision: {metric[0]}\nrecall: {metric[1]}\nf1: {metric[2]}")

top 1: 
precision: 0.11581920903954802
recall: 0.05416116248348745
f1: 0.0738073807380738
top 2: 
precision: 0.10404896421845575
recall: 0.09731395860854249
f1: 0.1005688282138794
top 3: 
precision: 0.0903954802259887
recall: 0.12681638044914134
f1: 0.10555250137438153
top 4: 
precision: 0.07909604519774012
recall: 0.14795244385733158
f1: 0.10308329498389325
top 5: 
precision: 0.0711864406779661
recall: 0.166446499339498
f1: 0.0997229916897507
top 6: 
precision: 0.06528562460765851
recall: 0.1831792162043153
f1: 0.09626287168807128
top 7: 
precision: 0.05905299973096583
recall: 0.1933069132540731
f1: 0.0904688304997424
top 8: 
precision: 0.05520244821092279
recall: 0.20651695288419197
f1: 0.08711804588093248
top 9: 
precision: 0.05252144800167399
recall: 0.22104799647732276
f1: 0.08487615183024769
top 10: 
precision: 0.04962335216572505
recall: 0.23205636283575518
f1: 0.08176246994026841
