In [None]:
# Download NLTK's 'punkt' tokenizer package
# (presumably required by word_tokenize, which is imported in the next cell).
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy

In [None]:
# Load the tab-separated training and trial files from /content.
train_set = pd.read_csv("/content/training.csv",sep="\t")
trial_set = pd.read_csv("/content/trial.csv",sep="\t")

In [None]:
def tokenize(data):
	"""Lower-case and word-tokenize the 'Text Transcription' column of ``data`` in place.

	Args:
		data: DataFrame with a string column named 'Text Transcription'.

	Returns:
		None. The column of ``data`` is replaced by lists of tokens.
	"""
	# Operate on the frame that was passed in, not on the global train_set,
	# so the function also works for other frames (e.g. the trial set).
	data['Text Transcription'] = data['Text Transcription'].apply(lambda text: word_tokenize(text.lower()))

In [None]:
def create_vocabulary(sentence_tokens):
	"""Build the vocabulary and a word -> id mapping from tokenized sentences.

	Args:
		sentence_tokens: iterable of token sequences (one sequence per sentence).

	Returns:
		(vocabulary, word_to_id): ``vocabulary`` is the sorted list of unique
		tokens and ``word_to_id[word]`` is the position of ``word`` in that list.
	"""
	# Sorting makes the ids deterministic; plain set iteration order can change
	# between interpreter runs because string hashing is randomized.
	vocabulary = sorted({token for tokens in sentence_tokens for token in tokens})
	word_to_id = {word: index for index, word in enumerate(vocabulary)}
	return vocabulary, word_to_id

In [None]:
# Tokenize the training transcriptions; train_set is modified in place.
tokenize(train_set)

In [None]:
# Preview the first rows to check the tokenized 'Text Transcription' column.
train_set.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,1.jpg,0,0,0,0,0,"[milk, milk.zip]"
1,10.jpg,1,0,0,0,1,"[roses, are, red, ,, violets, are, blue, if, y..."
2,1000.jpg,0,0,0,0,0,"[breaking, news, :, russia, releases, photo, o..."
3,10000.jpg,0,0,0,0,0,"[man, seeking, woman, ignad, 18, o]"
4,10006.jpg,0,0,0,0,0,"[me, explaining, the, deep, lore, of, ., j.r.r..."


In [None]:
# Token lists and the binary 'misogynous' target.
sentences = train_set['Text Transcription'].values
labels = train_set.misogynous.values
vocabulary, word_to_id = create_vocabulary(sentences)

# Replace every token by its vocabulary id.
train_set['indices'] = train_set['Text Transcription'].apply(
    lambda tokens: np.array([word_to_id[token] for token in tokens])
)
sentence_indices = train_set['indices'].values

# Pad / truncate all sequences to the integer part of the mean sentence length.
# NOTE(review): pad_sequences pads with 0 by default, which is also the id of
# vocabulary[0] -- confirm whether a reserved padding id is wanted.
len_sentences = train_set['Text Transcription'].apply(len)
padded_sentences = pad_sequences(
    sentence_indices,
    maxlen=int(sum(len_sentences) / len(len_sentences)),
)

# Vanilla

In [None]:
# Stratified 80/20 split on the binary label with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(padded_sentences, labels, test_size=0.2, random_state=42, stratify=labels)

# Baseline model: 50-dimensional embedding layer with no pretrained weights.
# NOTE(review): trainable=False freezes this layer at its initial values, so the
# embeddings are never learned -- confirm that a frozen baseline is intended.
model = Sequential()
model.add(Embedding(input_dim=len(vocabulary), output_dim=50, trainable=False))

In [None]:
# LSTM encoder followed by one sigmoid unit for the binary label.
model.add(LSTM(256))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.01),loss=binary_crossentropy, metrics=['accuracy'])

# Fit on the training split only; no validation data is passed here.
model.fit(x_train, y_train, epochs=15, batch_size=30, verbose=2)

Epoch 1/15
267/267 - 19s - loss: 0.6956 - accuracy: 0.5284 - 19s/epoch - 72ms/step
Epoch 2/15
267/267 - 16s - loss: 0.6979 - accuracy: 0.4939 - 16s/epoch - 60ms/step
Epoch 3/15
267/267 - 16s - loss: 0.6949 - accuracy: 0.5046 - 16s/epoch - 60ms/step
Epoch 4/15
267/267 - 16s - loss: 0.6942 - accuracy: 0.5054 - 16s/epoch - 61ms/step
Epoch 5/15
267/267 - 16s - loss: 0.6947 - accuracy: 0.4999 - 16s/epoch - 61ms/step
Epoch 6/15
267/267 - 16s - loss: 0.6952 - accuracy: 0.4964 - 16s/epoch - 61ms/step
Epoch 7/15
267/267 - 16s - loss: 0.6946 - accuracy: 0.4900 - 16s/epoch - 61ms/step
Epoch 8/15
267/267 - 16s - loss: 0.6936 - accuracy: 0.5031 - 16s/epoch - 60ms/step
Epoch 9/15
267/267 - 16s - loss: 0.6945 - accuracy: 0.4927 - 16s/epoch - 61ms/step
Epoch 10/15
267/267 - 16s - loss: 0.6936 - accuracy: 0.5060 - 16s/epoch - 61ms/step
Epoch 11/15
267/267 - 16s - loss: 0.6939 - accuracy: 0.5076 - 16s/epoch - 61ms/step
Epoch 12/15
267/267 - 16s - loss: 0.6940 - accuracy: 0.5307 - 16s/epoch - 61ms/step
E

<keras.callbacks.History at 0x7f83f9596dd0>

In [None]:
from sklearn.metrics import classification_report

# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 threshold.
predict_x = model.predict(x_test)
classes_x = [int(row[0] > 0.5) for row in predict_x]
print(classification_report(y_test, classes_x))

              precision    recall  f1-score   support

           0       0.59      0.47      0.52      1000
           1       0.56      0.67      0.61      1000

    accuracy                           0.57      2000
   macro avg       0.58      0.57      0.57      2000
weighted avg       0.58      0.57      0.57      2000



#Word2Vec

In [None]:
import gensim
from gensim.models import word2vec

# Train 50-dimensional skip-gram (sg=1) vectors on the tokenized transcriptions.
# min_count=1 keeps every token, so each entry of `vocabulary` has a vector.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm which gensim version this notebook targets.
word2vec = gensim.models.Word2Vec(train_set['Text Transcription'], min_count = 1, size = 50, window = 5, sg=1)

# Row i of the Embedding weights must be the vector of vocabulary[i], because the
# padded sequences use word_to_id ids. gensim's internal matrix (wv.syn0) follows
# gensim's own index order, so build the matrix by looking up each word.
word2vec_weights = np.array([word2vec.wv[word] for word in vocabulary])
word2vec_weights.shape

  """


(22415, 50)

In [None]:
# LSTM classifier whose frozen embedding layer is initialised from word2vec_weights.
model = Sequential([
    Embedding(input_dim=len(vocabulary), output_dim=50, weights=[word2vec_weights], trainable=False),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss=binary_crossentropy,
    metrics=['accuracy'],
)
model.fit(x_train, y_train, epochs=15, batch_size=30, verbose=2)

Epoch 1/15
267/267 - 8s - loss: 0.6915 - accuracy: 0.5324 - 8s/epoch - 28ms/step
Epoch 2/15
267/267 - 5s - loss: 0.6862 - accuracy: 0.5556 - 5s/epoch - 20ms/step
Epoch 3/15
267/267 - 5s - loss: 0.6830 - accuracy: 0.5574 - 5s/epoch - 20ms/step
Epoch 4/15
267/267 - 5s - loss: 0.6819 - accuracy: 0.5665 - 5s/epoch - 20ms/step
Epoch 5/15
267/267 - 5s - loss: 0.6814 - accuracy: 0.5651 - 5s/epoch - 20ms/step
Epoch 6/15
267/267 - 5s - loss: 0.6788 - accuracy: 0.5729 - 5s/epoch - 20ms/step
Epoch 7/15
267/267 - 5s - loss: 0.6773 - accuracy: 0.5681 - 5s/epoch - 20ms/step
Epoch 8/15
267/267 - 5s - loss: 0.6765 - accuracy: 0.5742 - 5s/epoch - 20ms/step
Epoch 9/15
267/267 - 5s - loss: 0.6736 - accuracy: 0.5749 - 5s/epoch - 20ms/step
Epoch 10/15
267/267 - 5s - loss: 0.6726 - accuracy: 0.5815 - 5s/epoch - 20ms/step
Epoch 11/15
267/267 - 5s - loss: 0.6703 - accuracy: 0.5856 - 5s/epoch - 20ms/step
Epoch 12/15
267/267 - 5s - loss: 0.6696 - accuracy: 0.5896 - 5s/epoch - 20ms/step
Epoch 13/15
267/267 - 5s 

<keras.callbacks.History at 0x7f83f373cfd0>

In [None]:
# Evaluate on the held-out split: the loss followed by the compiled metric.
model.evaluate(x_test,y_test)



[0.6743536591529846, 0.5730000138282776]

In [None]:
from sklearn.metrics import classification_report

# Hard 0/1 predictions: a sample is positive when its sigmoid output exceeds 0.5.
predict_x = model.predict(x_test)
classes_x = [int(row[0] > 0.5) for row in predict_x]
print(classification_report(y_test, classes_x))

              precision    recall  f1-score   support

           0       0.58      0.55      0.56      1000
           1       0.57      0.59      0.58      1000

    accuracy                           0.57      2000
   macro avg       0.57      0.57      0.57      2000
weighted avg       0.57      0.57      0.57      2000



#GloVe

In [None]:
# Read the 50-dimensional GloVe vectors, keeping only words in our vocabulary.
embeddings = dict()
# A set gives constant-time membership tests; scanning the vocabulary list for
# every line of the GloVe file is needlessly slow.
known_words = set(vocabulary)
with open('/content/glove.6B.50d.txt', 'r', encoding='utf-8') as doc:
    for line in doc:
        parts = line.rstrip('\n').lower().split(' ')
        if parts[0] in known_words:
            # The builtin float replaces np.float, which was removed from NumPy.
            embeddings[parts[0]] = np.array(parts[1:], dtype=float)


In [None]:
# One 50-dimensional row per vocabulary entry: the GloVe vector when the word was
# found in the GloVe file, otherwise a standard-normal random vector.
embedding_matrix = np.zeros((len(vocabulary), 50))
for row, word in enumerate(vocabulary):
    vector = embeddings.get(word)
    if vector is None:
        vector = np.random.standard_normal(50)
    embedding_matrix[row] = vector

In [None]:
# LSTM classifier whose frozen embedding layer is initialised from embedding_matrix.
model = Sequential([
    Embedding(input_dim=len(vocabulary), output_dim=50, weights=[embedding_matrix], trainable=False),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss=binary_crossentropy,
    metrics=['accuracy'],
)
model.fit(x_train, y_train, epochs=15, batch_size=30, verbose=2)

Epoch 1/15
267/267 - 7s - loss: 0.6347 - accuracy: 0.6289 - 7s/epoch - 28ms/step
Epoch 2/15
267/267 - 5s - loss: 0.5294 - accuracy: 0.7377 - 5s/epoch - 20ms/step
Epoch 3/15
267/267 - 5s - loss: 0.4571 - accuracy: 0.7843 - 5s/epoch - 20ms/step
Epoch 4/15
267/267 - 5s - loss: 0.3788 - accuracy: 0.8315 - 5s/epoch - 20ms/step
Epoch 5/15
267/267 - 5s - loss: 0.3113 - accuracy: 0.8629 - 5s/epoch - 20ms/step
Epoch 6/15
267/267 - 5s - loss: 0.2571 - accuracy: 0.8899 - 5s/epoch - 20ms/step
Epoch 7/15
267/267 - 5s - loss: 0.1993 - accuracy: 0.9224 - 5s/epoch - 20ms/step
Epoch 8/15
267/267 - 5s - loss: 0.1910 - accuracy: 0.9240 - 5s/epoch - 20ms/step
Epoch 9/15
267/267 - 5s - loss: 0.2091 - accuracy: 0.9153 - 5s/epoch - 20ms/step
Epoch 10/15
267/267 - 5s - loss: 0.1536 - accuracy: 0.9401 - 5s/epoch - 20ms/step
Epoch 11/15
267/267 - 5s - loss: 0.1341 - accuracy: 0.9484 - 5s/epoch - 20ms/step
Epoch 12/15
267/267 - 5s - loss: 0.1423 - accuracy: 0.9456 - 5s/epoch - 20ms/step
Epoch 13/15
267/267 - 5s 

<keras.callbacks.History at 0x7f83f8b61250>

In [None]:
# Evaluate on the held-out split: the loss followed by the compiled metric.
model.evaluate(x_test,y_test)



[1.0451955795288086, 0.718999981880188]

In [None]:
from sklearn.metrics import classification_report

# Threshold the predicted probabilities at 0.5 and print per-class metrics.
predict_x = model.predict(x_test)
classes_x = [int(row[0] > 0.5) for row in predict_x]
print(classification_report(y_test, classes_x))

              precision    recall  f1-score   support

           0       0.72      0.72      0.72      1000
           1       0.72      0.72      0.72      1000

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000



Перформансите не се подобрени во однос на првата лабораториска вежба и покрај користењето на длабоко учење, односно невронски мрежи. Сепак, треба да земеме предвид дека не користиме cross-validation ниту каков било sampling за да го балансираме dataset-от; со комбинација од некои од овие концепти би можеле значително да го подобриме моделот на невронските мрежи.

#Task 2

In [None]:
# NOTE(review): MultiOutputClassifier is imported but not used in the visible cells.
from sklearn.multioutput import MultiOutputClassifier
# Multi-label target: `labels` is rebound from the binary array to a 5-column frame.
labels = train_set[['misogynous','shaming','stereotype','objectification','violence']]
# Same 80/20 split and seed as before, but without stratification.
x_train, x_test, y_train, y_test = train_test_split(padded_sentences, labels, test_size=0.2, random_state=42)


In [None]:
from keras import backend as K
def full_multi_label_metric(y_true, y_pred):
    """Exact-match indicator for multi-label predictions.

    Rounds ``y_pred`` and returns, for each sample, 1.0 when every label along
    the last axis equals ``y_true`` and 0.0 otherwise (cast to the backend float type).
    """
    rounded = K.round(y_pred)
    label_matches = K.equal(y_true, rounded)
    whole_row_matches = K.all(label_matches, axis=-1)
    return K.cast(whole_row_matches, K.floatx())

##GloVe

In [None]:
# Multi-label LSTM: frozen embedding initialised from embedding_matrix, five
# independent sigmoid outputs (one per label), tracked with the exact-match metric.
model = Sequential([
    Embedding(input_dim=len(vocabulary), output_dim=50, weights=[embedding_matrix], trainable=False),
    LSTM(128),
    Dense(5, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss=binary_crossentropy,
    metrics=[full_multi_label_metric],
)
model.fit(x_train, y_train, epochs=15, batch_size=30, verbose=2)

Epoch 1/15
267/267 - 7s - loss: 0.4857 - full_multi_label_metric: 0.3625 - 7s/epoch - 28ms/step
Epoch 2/15
267/267 - 5s - loss: 0.4470 - full_multi_label_metric: 0.4384 - 5s/epoch - 20ms/step
Epoch 3/15
267/267 - 5s - loss: 0.4140 - full_multi_label_metric: 0.4694 - 5s/epoch - 20ms/step
Epoch 4/15
267/267 - 5s - loss: 0.3745 - full_multi_label_metric: 0.5111 - 5s/epoch - 20ms/step
Epoch 5/15
267/267 - 5s - loss: 0.3329 - full_multi_label_metric: 0.5493 - 5s/epoch - 20ms/step
Epoch 6/15
267/267 - 5s - loss: 0.2942 - full_multi_label_metric: 0.5993 - 5s/epoch - 20ms/step
Epoch 7/15
267/267 - 5s - loss: 0.2593 - full_multi_label_metric: 0.6325 - 5s/epoch - 20ms/step
Epoch 8/15
267/267 - 5s - loss: 0.2373 - full_multi_label_metric: 0.6640 - 5s/epoch - 20ms/step
Epoch 9/15
267/267 - 5s - loss: 0.2182 - full_multi_label_metric: 0.6809 - 5s/epoch - 20ms/step
Epoch 10/15
267/267 - 5s - loss: 0.2040 - full_multi_label_metric: 0.6986 - 5s/epoch - 20ms/step
Epoch 11/15
267/267 - 5s - loss: 0.1805

<keras.callbacks.History at 0x7f83f07c2fd0>

In [None]:
# Evaluate on the held-out split: the loss followed by full_multi_label_metric.
model.evaluate(x_test,y_test)



[0.7136902809143066, 0.4034999907016754]

In [None]:
predict_x = model.predict(x_test)
# One list of 0/1 decisions per sample: a label is predicted when its score exceeds 0.5.
multioutput = [
    [1 if score > 0.5 else 0 for score in scores]
    for scores in predict_x
]

In [None]:
# Per-label confusion counts (index i is the i-th label column of y_test).
TP = [0] * 5
TN = [0] * 5
FP = [0] * 5
FN = [0] * 5
# (predicted, actual) -> counter list to increment; other value pairs are ignored.
tally = {(1, 1): TP, (1, 0): FP, (0, 0): TN, (0, 1): FN}
for predict, true in zip(multioutput, y_test.values):
  for i in range(len(predict)):
    bucket = tally.get((predict[i], true[i]))
    if bucket is not None:
      bucket[i] += 1

In [None]:
# Per-label metrics from the confusion counts. A ratio whose denominator is zero
# (e.g. precision for a label that was never predicted) is reported as 0.0
# instead of producing NaN and a RuntimeWarning.
tp, tn, fp, fn = (np.asarray(counts, dtype=float) for counts in (TP, TN, FP, FN))
accuracy = (tp + tn) / len(multioutput)
precision = np.divide(tp, tp + fp, out=np.zeros_like(tp), where=(tp + fp) > 0)
recall = np.divide(tp, tp + fn, out=np.zeros_like(tp), where=(tp + fn) > 0)
F1 = np.divide(2 * precision * recall, precision + recall,
               out=np.zeros_like(tp), where=(precision + recall) > 0)

In [None]:
# Results table: one row per metric, one column per label.
pd.DataFrame(data=[accuracy,precision,recall,F1],columns=y_test.columns,index=["Accuracy","Precision","Recall","F1"])

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence
Accuracy,0.698,0.82,0.73,0.757,0.8815
Precision,0.69084,0.241706,0.523346,0.434568,0.268041
Recall,0.721116,0.203187,0.47695,0.406467,0.135417
F1,0.705653,0.220779,0.499072,0.420048,0.179931


##Word2Vec

In [None]:
# Multi-label LSTM: frozen embedding initialised from word2vec_weights, five
# independent sigmoid outputs (one per label), tracked with the exact-match metric.
model = Sequential([
    Embedding(input_dim=len(vocabulary), output_dim=50, weights=[word2vec_weights], trainable=False),
    LSTM(128),
    Dense(5, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss=binary_crossentropy,
    metrics=[full_multi_label_metric],
)
model.fit(x_train, y_train, epochs=15, batch_size=30, verbose=2)

Epoch 1/15
267/267 - 8s - loss: 0.5070 - full_multi_label_metric: 0.2477 - 8s/epoch - 28ms/step
Epoch 2/15
267/267 - 5s - loss: 0.5000 - full_multi_label_metric: 0.2689 - 5s/epoch - 20ms/step
Epoch 3/15
267/267 - 5s - loss: 0.4981 - full_multi_label_metric: 0.2766 - 5s/epoch - 20ms/step
Epoch 4/15
267/267 - 5s - loss: 0.4975 - full_multi_label_metric: 0.2906 - 5s/epoch - 20ms/step
Epoch 5/15
267/267 - 5s - loss: 0.4974 - full_multi_label_metric: 0.2952 - 5s/epoch - 20ms/step
Epoch 6/15
267/267 - 5s - loss: 0.4966 - full_multi_label_metric: 0.3000 - 5s/epoch - 20ms/step
Epoch 7/15
267/267 - 5s - loss: 0.4958 - full_multi_label_metric: 0.3018 - 5s/epoch - 20ms/step
Epoch 8/15
267/267 - 5s - loss: 0.4953 - full_multi_label_metric: 0.2962 - 5s/epoch - 20ms/step
Epoch 9/15
267/267 - 5s - loss: 0.4950 - full_multi_label_metric: 0.2941 - 5s/epoch - 20ms/step
Epoch 10/15
267/267 - 5s - loss: 0.4945 - full_multi_label_metric: 0.2931 - 5s/epoch - 20ms/step
Epoch 11/15
267/267 - 5s - loss: 0.4938

<keras.callbacks.History at 0x7f83ee766890>

In [None]:
# Evaluate on the held-out split: the loss followed by full_multi_label_metric.
model.evaluate(x_test,y_test)



[0.4966822862625122, 0.29899999499320984]

In [None]:
predict_x = model.predict(x_test)
# One list of 0/1 decisions per sample: a label is predicted when its score exceeds 0.5.
multioutput = [
    [1 if score > 0.5 else 0 for score in scores]
    for scores in predict_x
]

In [None]:
# Per-label confusion counts (index i is the i-th label column of y_test).
TP = [0] * 5
TN = [0] * 5
FP = [0] * 5
FN = [0] * 5
for predict, true in zip(multioutput, y_test.values):
  for i in range(len(predict)):
    if predict[i] == 1 and true[i] == 1:
      TP[i] += 1
    elif predict[i] == 1 and true[i] == 0:
      FP[i] += 1
    elif predict[i] == 0 and true[i] == 0:
      TN[i] += 1
    elif predict[i] == 0 and true[i] == 1:
      FN[i] += 1

# Per-label metrics. A ratio whose denominator is zero (e.g. precision for a label
# the model never predicts) is reported as 0.0 instead of producing NaN and a
# RuntimeWarning.
tp, tn, fp, fn = (np.asarray(counts, dtype=float) for counts in (TP, TN, FP, FN))
accuracy = (tp + tn) / len(multioutput)
precision = np.divide(tp, tp + fp, out=np.zeros_like(tp), where=(tp + fp) > 0)
recall = np.divide(tp, tp + fn, out=np.zeros_like(tp), where=(tp + fn) > 0)
F1 = np.divide(2 * precision * recall, precision + recall,
               out=np.zeros_like(tp), where=(precision + recall) > 0)

# Results table: one row per metric, one column per label.
pd.DataFrame(data=[accuracy,precision,recall,F1],columns=y_test.columns,index=["Accuracy","Precision","Recall","F1"])

  app.launch_new_instance()


Unnamed: 0,misogynous,shaming,stereotype,objectification,violence
Accuracy,0.5695,0.8745,0.717,0.7835,0.904
Precision,0.572589,,0.0,,
Recall,0.561753,0.0,0.0,0.0,0.0
F1,0.567119,,,,


Доста комплицирано е во однос кога се предвидува нешто Multi-label. Но, не се подобрени перформансите.
Во документацијата на Keras има точно како се користи метриката при тренирање на моделот, затоа направивме своја метрика односно моделот предвидува точност во целост на торката. Пример  
Ако актуелната торка од y_test  е  
[1,0,1,0,0]  
А ние сме предвиделе  
[1,0,0,0,0]  
Тогаш тоа го сметаме како неточно предвидено.
Но секако водиме евиденција и за поединечни лабели, а тоа е претставено во DataFrames.
