In [1]:
!pip install -U nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.5 MB/s 
Collecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 12.1 MB/s 
Installing collected packages: regex, nltk
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.7 regex-2022.3.15


In [2]:
import pandas as pd
import re
import json
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
from math import factorial, log
from itertools import combinations
from collections import Counter
import numpy as np
import gc
nltk.download('stopwords')
nltk.download('punkt')

from nltk.util import pad_sequence
from nltk.util import ngrams
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.translate.bleu_score import sentence_bleu

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, Dense, Dropout, SimpleRNN, LSTM
from keras.callbacks import EarlyStopping
import tensorflow
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# from google.colab import drive
# drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
EPSILON = 1e-20
INF = 1e20
N = 3
K = 15
MAXLEN = 40
VSIZE = 9194
tokenizer = ''

In [5]:
def get_vocabulary(sentences, threshold=5):
    word_frequencies = {}
    for sentence in sentences:
        words = re.sub('\W', ' ', sentence.lower()).split()
        words = [w for w in words if len(w) > 0]
        for word in words:
            if word_frequencies.get(word) is not None:
                word_frequencies[word] += 1
            else:
                word_frequencies[word] = 1
    return {w: v for w, v in word_frequencies.items() if v >= threshold}

In [6]:
def is_valid_string(attribute_value):
    return not (attribute_value == None or pd.isnull(attribute_value) or \
                str(attribute_value) == "" or str(attribute_value) == "nan" or \
                len(attribute_value) == 0)

# Loading data 

In [7]:
load_paths = ['../../data/train_data.csv', '../../data/valid_data.csv', '../../data/test_data.csv']
# load_paths = ['/content/gdrive/My Drive/Nlp-project/train_data.csv', 
#               '/content/gdrive/My Drive/Nlp-project/valid_data.csv', 
#               '/content/gdrive/My Drive/Nlp-project/test_data.csv']
load_paths = ['./train_data.csv', './valid_data.csv', './test_data.csv']

train_data = pd.read_csv(load_paths[0])
valid_data = pd.read_csv(load_paths[1])
test_data = pd.read_csv(load_paths[2])

In [8]:
def filter_unwanted_rows(data):
    y = data['Label'].tolist()
    to_drop = [i for i in range(len(y)) if y[i] > 3]
    new_data = data.drop(labels=to_drop, axis=0)
    new_data = new_data[new_data['Source'].str.split().str.len().lt(201)]
    new_data = new_data[new_data['Source'].str.split().str.len().lt(201)]
    # return np.asarray(new_data['Source'][:250]), np.asarray(new_data['Label'][:250]), np.asarray(new_data['Reference'][:250])
    return np.asarray(new_data['Source']), np.asarray(new_data['Label']), np.asarray(new_data['Reference'])

In [9]:
X_train, y_train, X_train_ref = filter_unwanted_rows(train_data)
X_test, y_test, X_test_ref = filter_unwanted_rows(test_data)
X_valid, y_valid, X_valid_ref = filter_unwanted_rows(valid_data)

print(sorted([len(x) for x in X_train]))

print("Train labels ratio")
print(Counter(y_train.tolist()))
print("Test labels ratio")
print(Counter(y_test.tolist()))
print("Validation labels ratio")
print(Counter(y_valid.tolist()))

[50, 50, 50, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 55, 55, 56, 57, 57, 57, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 62, 62, 62, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 71, 71, 71, 71, 71, 71, 71, 71, 71, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81,

In [10]:
VOCAB = get_vocabulary(X_train)
print(len(VOCAB))
TOTAL = sum(VOCAB.values())
print(TOTAL)

9194
474833


In [11]:
# Different techniques for tackling class imbalance
from imblearn.over_sampling import RandomOverSampler, SMOTE

def balance_data(x, y, _type=0):
    if _type == 1:
        ros = RandomOverSampler(random_state=42)
        return ros.fit_resample(x, y)
    elif _type == 2:
        smote = SMOTE()
        return smote.fit_resample(x, y)
    return x, y

# Language model

In [12]:
def get_seq_from_text(X_train, X_valid, X_test):
    global tokenizer
    # Keras tokenizer - fit on text data
    tokenizer = Tokenizer(num_words = VSIZE, oov_token='UNK')
    tokenizer.fit_on_texts(X_train)

    # Generating sequences from tokens for piece of text
    _X_train = [tokenizer.texts_to_sequences([text])[0] for text in X_train]
    _X_test = [tokenizer.texts_to_sequences([text])[0] for text in X_test]
    _X_valid = [tokenizer.texts_to_sequences([text])[0] for text in X_valid]
    return _X_train, _X_valid, _X_test

In [13]:
def get_inputs_and_outputs(_sequences):
    inputs = [_sequence[:i+1] for _sequence in _sequences for i in range(len(_sequence) - 1)]
    outputs = [_sequence[i+1] for _sequence in _sequences for i in range(len(_sequence) - 1)]
    return inputs, outputs

In [14]:
def load_batch_data(sequences, steps_per_epoch, batch_size):
    curr_batch_num = 0
    while True:
        if curr_batch_num >= steps_per_epoch:
            curr_batch_num = 0
            continue
        curr_inputs, curr_outputs = get_inputs_and_outputs(sequences[(curr_batch_num * batch_size):((curr_batch_num + 1) * batch_size)])
        curr_inputs = np.array(pad_sequences(curr_inputs, maxlen=MAXLEN, padding='pre'))
        curr_outputs = to_categorical(curr_outputs, num_classes=VSIZE)
        yield (np.array(curr_inputs), np.asarray(curr_outputs))
        curr_batch_num += 1

In [15]:
def define_LSTM_LM(embedding_dim=300, lstm_units=300):
    model = Sequential()
    model.add(Embedding(VSIZE, embedding_dim, input_length=MAXLEN))
    model.add(LSTM(lstm_units, return_sequences=False))
    model.add(Dense(VSIZE, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
    print(model.summary())
    return model

In [16]:
def define_RNN_LM(embedding_dim=300, rnn_units=300):
    model = Sequential()
    model.add(Embedding(VSIZE, embedding_dim, input_length=MAXLEN))
    model.add(SimpleRNN(rnn_units, return_sequences=False))
    model.add(Dense(VSIZE, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
    print(model.summary())
    return model

In [17]:
def train_model(model, X_train, X_valid, name, epochs=40):
    checkpoint = ModelCheckpoint(filepath=f'/content/gdrive/My Drive/Nlp-project/{name}.hdf5', 
                                 monitor="val_loss", save_best_only=True, verbose=1)
    SENT_PER_BATCH = 50

    train_steps_per_epoch = np.ceil(len(X_train) / SENT_PER_BATCH)
    train_data_gen = load_batch_data(X_train, train_steps_per_epoch, SENT_PER_BATCH)
    valid_steps_per_epoch = np.ceil(len(X_valid) / SENT_PER_BATCH)
    valid_data_gen = load_batch_data(X_valid, valid_steps_per_epoch, SENT_PER_BATCH)

    model.fit_generator(train_data_gen, epochs=epochs, steps_per_epoch=train_steps_per_epoch,
                                          validation_data=valid_data_gen, validation_steps=valid_steps_per_epoch,
                                          verbose = 1, callbacks=[checkpoint])
    return model

In [18]:
X_train_seq, X_valid_seq, X_test_seq = get_seq_from_text(X_train, X_valid, X_test)

In [19]:
# lstm_lm = define_LSTM_LM()
# lstm_lm = train_model(lstm_lm, X_train_seq, X_valid_seq, 'LSTM_LM')

In [20]:
# rnn_lm = define_RNN_LM()
# rnn_lm = train_model(rnn_lm, X_train_seq, X_valid_seq, 'RNN_LM', epochs=20)

# LM predictions and calculations

In [21]:
# lstm_lm = load_model('/content/gdrive/My Drive/Nlp-project/LSTM_LM.hdf5')
# rnn_lm = load_model('/content/gdrive/My Drive/Nlp-project/RNN_LM.hdf5')
# lstm_lm = load_model('./LSTM_LM.hdf5')
rnn_lm = load_model('./RNN_LM.hdf5')

In [22]:
def calculate_perplexity(model, _text):
    global tokenizer
    text = re.sub('\W', ' ', _text.lower()).strip()
    tokens = tokenizer.texts_to_sequences([text])[0]
    if len(tokens) < 2:
        return 1, 0.5
    x, y = np.array([tokens[:-1]]), np.array([tokens[-1]])
    x, y = np.array(pad_sequences(x, maxlen=MAXLEN, padding='pre')), np.asarray(to_categorical(y, num_classes=VSIZE))
    try:
        cross_entropy = model.evaluate(x, y, verbose=0)
        perplexity = np.exp(cross_entropy)
        probability = 1 / (perplexity ** y.shape[0])
        return perplexity, probability
    except Exception as e:
        with open('./errs.txt', 'a') as f:
            print(e, file=f)
        return INF, 0

In [23]:
def get_all_ngram_probabilities(model, given_text):
    text = given_text.lower()
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return [], INF
    all_ngram_probabilities, all_perplexities = [], []
    for sent in sentences:
        words = word_tokenize(sent)
        curr_ngrams = list(ngrams(words, n=N, pad_left=True, pad_right=True, left_pad_symbol="<s>", right_pad_symbol="</s>"))
        probabilities = [max(EPSILON, calculate_perplexity(model, ' '.join(ngram))[1]) for ngram in curr_ngrams]
        all_ngram_probabilities.append(probabilities)
        try:
            perplexity = (1 / np.prod(np.asarray(probabilities))) ** (1 / len(probabilities))
        except:
            perplexity = INF
        all_perplexities.append(perplexity)
    return all_ngram_probabilities, sum(all_perplexities) / len(all_perplexities)

In [24]:
def get_prob_feature_vector(text, model):
    global K
    features = []
    all_ngram_probabilities, perplexity = get_all_ngram_probabilities(model, text) 
    all_ngram_probabilities = sorted(sum(all_ngram_probabilities, []))
    if len(all_ngram_probabilities) == 0:
        return [0 for _ in range(2*K)] + [INF]
    frequent_k, rarest_k = all_ngram_probabilities[-K:], all_ngram_probabilities[:K]
    if len(frequent_k) < K:
        median = frequent_k[len(frequent_k) // 2]
        for _ in range(K - len(frequent_k)):
            frequent_k.append(median)
        frequent_k = sorted(frequent_k)
    if len(rarest_k) < K:
        median = rarest_k[len(rarest_k) // 2]
        for _ in range(K - len(rarest_k)):
            rarest_k.append(median)
        rarest_k = sorted(rarest_k)
    features = features + frequent_k + rarest_k + [perplexity]
    return features

# Metrics for Fluency

In [25]:
def get_ROUGE_S(translation, reference, n=3):
    if not is_valid_string(translation) or not is_valid_string(reference):
        return 0
    trans_words = word_tokenize(translation.lower())[:MAXLEN]
    skip_gram_comb = list(combinations([i for i in range(len(trans_words))], n))
    trans_skip_grams = [[trans_words[skip_gram_comb[i][j]] for j in range(len(skip_gram_comb[i]))] for i in range(len(skip_gram_comb))]
    ref_words = word_tokenize(reference.lower())[:MAXLEN]
    skip_gram_comb = list(combinations([i for i in range(len(ref_words))], n))
    ref_skip_grams = [[ref_words[skip_gram_comb[i][j]] for j in range(len(skip_gram_comb[i]))] for i in range(len(skip_gram_comb))]

    del skip_gram_comb
    gc.collect()
    skip_count = len([gram for gram in ref_skip_grams if gram in trans_skip_grams])
    if ref_skip_grams == None or trans_skip_grams == None or len(ref_skip_grams) == 0 or len(trans_skip_grams) == 0:
        return 0
    R_skip = skip_count / len(ref_skip_grams)
    P_skip = skip_count / len(trans_skip_grams)
    if R_skip == 0:
        return 0
    beta = P_skip / R_skip
    denom = R_skip + (beta * beta * P_skip)
    if denom == 0:
        return 0
    rouge_s = ((1 + (beta * beta)) * R_skip * P_skip) / denom
    return rouge_s

In [26]:
def get_ROUGE_L(translation, reference):
    if not is_valid_string(translation) or not is_valid_string(reference):
        return 0
    trans_words, ref_words = word_tokenize(translation.lower())[:MAXLEN], word_tokenize(reference.lower())[:MAXLEN]
    m, n = len(trans_words), len(ref_words)
    if m == 0 or n == 0:
        return 0
    lcs = [[None]*(n+1) for i in range(m+1)]
    for i in range(m+1):
        for j in range(n+1):
            if i == 0 or j == 0:
                lcs[i][j] = 0
            elif trans_words[i-1] == ref_words[j-1]:
                lcs[i][j] = lcs[i-1][j-1] + 1
            else:
                lcs[i][j] = max(lcs[i-1][j] , lcs[i][j-1])
    ans = lcs[m][n]
    del lcs
    gc.collect()
    return ans

In [27]:
def get_BLEU_score(sentence, reference):
    if not is_valid_string(sentence) or not is_valid_string(reference):
        return 0
    sentence_tokens = word_tokenize(sentence.lower())[:MAXLEN]
    ref_tokens = word_tokenize(reference.lower())[:MAXLEN]
    return sentence_bleu([ref_tokens], sentence_tokens)

In [28]:
def get_SLOR(sentence, probability):
    if not is_valid_string(sentence):
        return 0
    sentence_tokens = word_tokenize(sentence.lower())[:MAXLEN]
    if len(sentence_tokens) == 0:
        return 0
    term_1 = (1 / len(sentence_tokens)) * log(probability)
    unigram_prob = 1
    for word in sentence_tokens:
        if VOCAB.get(word) is not None:
            unigram_prob *= (VOCAB[word] / TOTAL)
        else:
            unigram_prob = 0
            break
    if unigram_prob < EPSILON:
        unigram_prob = EPSILON
    term_2 = log(unigram_prob)
    return term_1 - term_2

In [29]:
def ngram_overlap(translation, reference, n=3):
    if not is_valid_string(translation) or not is_valid_string(reference):
        return 0
    trans_words = word_tokenize(translation.lower())[:MAXLEN]
    trans_ngrams = list(ngrams(trans_words, n=n, pad_left=True, pad_right=True, left_pad_symbol="<s>", right_pad_symbol="</s>"))
    ref_words = word_tokenize(reference.lower())[:MAXLEN]
    ref_ngrams = list(ngrams(ref_words, n=n, pad_left=True, pad_right=True, left_pad_symbol="<s>", right_pad_symbol="</s>"))
    intersect = len([i for i in ref_ngrams if i in trans_ngrams])
    union = len(ref_ngrams) + len(trans_ngrams) - intersect
    if union == 0:
        return 0
    return intersect / union

# Feature extraction and model training

In [None]:
from sklearn.linear_model import LogisticRegression

# Training on train set, testing on both validation and test sets - 70:30
train_features, train_labels, test_features, test_labels = [], [], [], []
rouge_s, rouge_l, bleu, ng_overlap, slor = [], [], [], [], []

for i in tqdm(range(len(X_train))):
    rs, rl, b, ng, s = get_ROUGE_S(X_train[i], X_train_ref[i], N), get_ROUGE_L(X_train[i], X_train_ref[i]), \
                      get_BLEU_score(X_train[i], X_train_ref[i]), ngram_overlap(X_train[i], X_train_ref[i], N), \
                      get_SLOR(X_train[i], calculate_perplexity(rnn_lm, X_train[i])[1])
    features = get_prob_feature_vector(X_train[i], rnn_lm) + [rs, rl, b, ng, s]
    train_features.append(features)
    train_labels.append(y_train[i])
    # if (i + 1) % 50 == 0:
    #     with open('/content/gdrive/My Drive/Nlp-project/train_log.json', 'w') as _f:
    #         print(i)
    #         json.dump({"train_features": np.array(train_features).tolist(), "train_labels": np.array(train_labels).tolist()}, _f)
    rouge_s.append(rs)
    rouge_l.append(rl)
    bleu.append(b)
    ng_overlap.append(ng)
    slor.append(s)
# with open('/content/gdrive/My Drive/Nlp-project/train_log.json', 'w') as _f:
#     json.dump({"train_features": np.array(train_features).tolist(), "train_labels": np.array(train_labels).tolist()}, _f)


for i in tqdm(range(len(X_test))):
    rs, rl, b, ng, s = get_ROUGE_S(X_test[i], X_test_ref[i], N), get_ROUGE_L(X_test[i], X_test_ref[i]), \
                      get_BLEU_score(X_test[i], X_test_ref[i]), ngram_overlap(X_test[i], X_test_ref[i], N), \
                      get_SLOR(X_test[i], calculate_perplexity(rnn_lm, X_test[i])[1])
    features = get_prob_feature_vector(X_test[i], rnn_lm) + [rs, rl, b, ng, s]
    test_features.append(features)
    test_labels.append(y_test[i])
    # if (i + 1) % 50 == 0:
    #     with open('/content/gdrive/My Drive/Nlp-project/test_log.json', 'w') as _f:
    #         print(i)
    #         json.dump({"test_features": np.array(test_features).tolist(), "test_labels": np.array(test_labels).tolist()}, _f)
    rouge_s.append(rs)
    rouge_l.append(rl)
    bleu.append(b)
    ng_overlap.append(ng)
    slor.append(s)
# with open('/content/gdrive/My Drive/Nlp-project/test_log.json', 'w') as _f:
#     json.dump({"test_features": np.array(test_features).tolist(), "test_labels": np.array(test_labels).tolist()}, _f)

for i in tqdm(range(len(X_valid))):
    rs, rl, b, ng, s = get_ROUGE_S(X_valid[i], X_valid_ref[i], N), get_ROUGE_L(X_valid[i], X_valid_ref[i]), \
                      get_BLEU_score(X_valid[i], X_valid_ref[i]), ngram_overlap(X_valid[i], X_valid_ref[i], N), \
                      get_SLOR(X_valid[i], calculate_perplexity(rnn_lm, X_valid[i])[1])
    features = get_prob_feature_vector(X_valid[i], rnn_lm) + [rs, rl, b, ng, s]
    test_features.append(features)
    test_labels.append(y_valid[i])
    # if (i + 1) % 50 == 0:
    #     with open('/content/gdrive/My Drive/Nlp-project/test_log_7.json', 'w') as _f:
    #         print(i)
    #         json.dump({"test_features": np.array(test_features).tolist(), "test_labels": np.array(test_labels).tolist()}, _f)
    rouge_s.append(rs)
    rouge_l.append(rl)
    bleu.append(b)
    ng_overlap.append(ng)
    slor.append(s)
# with open('/content/gdrive/My Drive/Nlp-project/test_log_7.json', 'w') as _f:
#     json.dump({"test_features": np.array(test_features).tolist(), "test_labels": np.array(test_labels).tolist()}, _f)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  2%|▏         | 50/2202 [02:40<1:47:42,  3.00s/it]

1299


  5%|▍         | 100/2202 [05:20<1:39:33,  2.84s/it]

1349


  7%|▋         | 150/2202 [07:54<1:20:19,  2.35s/it]

1399


  9%|▉         | 200/2202 [10:34<1:42:45,  3.08s/it]

1449


 11%|█▏        | 250/2202 [13:19<1:31:18,  2.81s/it]

1499


 14%|█▎        | 300/2202 [15:55<1:41:38,  3.21s/it]

1549


 16%|█▌        | 350/2202 [18:34<1:24:49,  2.75s/it]

1599


 18%|█▊        | 400/2202 [21:04<1:12:05,  2.40s/it]

1649


 20%|██        | 450/2202 [23:38<1:17:05,  2.64s/it]

1699


 23%|██▎       | 500/2202 [25:58<1:14:30,  2.63s/it]

1749


 25%|██▍       | 550/2202 [28:11<1:04:58,  2.36s/it]

1799


 27%|██▋       | 600/2202 [30:41<1:25:27,  3.20s/it]

1849


 30%|██▉       | 650/2202 [33:15<1:25:28,  3.30s/it]

1899


 32%|███▏      | 700/2202 [35:44<1:28:54,  3.55s/it]

1949


 34%|███▍      | 750/2202 [38:16<1:23:33,  3.45s/it]

1999


 36%|███▋      | 800/2202 [40:31<53:56,  2.31s/it]

2049


  del sys.path[0]
 39%|███▊      | 850/2202 [43:05<1:05:17,  2.90s/it]

2099


 41%|████      | 900/2202 [45:30<1:13:29,  3.39s/it]

2149


 43%|████▎     | 950/2202 [47:52<59:14,  2.84s/it]  

2199


 45%|████▌     | 1000/2202 [50:26<1:02:24,  3.12s/it]

2249


 48%|████▊     | 1050/2202 [52:53<1:11:22,  3.72s/it]

2299


 50%|████▉     | 1100/2202 [55:32<51:52,  2.82s/it]

2349


 52%|█████▏    | 1150/2202 [58:00<48:32,  2.77s/it]

2399


 54%|█████▍    | 1200/2202 [1:00:41<1:06:29,  3.98s/it]

2449


 55%|█████▍    | 1210/2202 [1:01:13<54:25,  3.29s/it]  

# Performance analysis

In [None]:
from scipy.stats import pearsonr

def find_correlation(scores, y):
    corr, _ = pearsonr(scores, y)
    return corr

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, precision_score, f1_score, recall_score, classification_report
import matplotlib.pyplot as plt
import json

statements = {0: "Without oversampling", 1: "With random oversampling", 2: "With SMOTE"}

def get_metrics(b, ytrue, ypred, file_name, folder_name):
    print('\n\n')
    print(statements[b] + '\n')
    print(classification_report(ytrue, ypred))
    ret = classification_report(ytrue, ypred, output_dict=True)
    ConfusionMatrixDisplay.from_predictions(ytrue, ypred)
    plt.savefig(f"{folder_name}/{file_name}_conf.png",dpi=300)
    plt.show()
    print('\n\n')
    with open(f"{folder_name}/{file_name}_stats.json", "w") as f:
        json.dump(ret, f, indent=4) 

In [None]:
# Calculating correlations for various scores
print("Correlation between ROUGE_L and labels: ", find_correlation(rouge_l, train_labels + test_labels))
print("Correlation between ROUGE_S and labels: ", find_correlation(rouge_s, train_labels + test_labels))
print("Correlation between BLEU score and labels: ", find_correlation(bleu, train_labels + test_labels))
print("Correlation between SLOR and labels: ", find_correlation(slor, train_labels + test_labels))
print("Correlation between N-gram overlap and labels: ", find_correlation(ng_overlap, train_labels + test_labels))

# Classification with different architectures

In [None]:
# RF
from sklearn.ensemble import RandomForestClassifier

for t in range(3):
    train_f, train_l = balance_data(train_features, train_labels, t)

    for i in range(len(train_f)):
        for j in range(len(train_f[i])):
            if str(train_f[i][j]) == 'inf':
                train_f[i][j] = INF
    for i in range(len(test_features)):
        for j in range(len(test_features[i])):
            if str(test_features[i][j]) == 'inf':
                test_features[i][j] = INF

    print("Class distribution:", Counter(train_l))
    rf_model = RandomForestClassifier(random_state=123).fit(train_f, train_l)
    print("\nFor training set\n")
    train_pred = rf_model.predict(train_f)
    get_metrics(t, train_l, train_pred, f"RF_train_{t}", "RF")
    print("\nFor test set\n")
    test_pred = rf_model.predict(test_features)
    get_metrics(t, test_labels, test_pred, f"RF_test_{t}", "RF")
    print('-'*210)

In [None]:
# DT
from sklearn.tree import DecisionTreeClassifier

for t in range(3):
    train_f, train_l = balance_data(train_features, train_labels, t)

    for i in range(len(train_f)):
        for j in range(len(train_f[i])):
            if str(train_f[i][j]) == 'inf':
                train_f[i][j] = INF
    for i in range(len(test_features)):
        for j in range(len(test_features[i])):
            if str(test_features[i][j]) == 'inf':
                test_features[i][j] = INF

    print("Class distribution:", Counter(train_l))
    dt_model = DecisionTreeClassifier(random_state=123).fit(train_f, train_l)
    print("\nFor training set\n")
    train_pred = dt_model.predict(train_f)
    get_metrics(t, train_l, train_pred, f"DT_train_{t}", "DT")
    print("\nFor test set\n")
    test_pred = dt_model.predict(test_features)
    get_metrics(t, test_labels, test_pred, f"DT_test_{t}", "DT")
    print('-'*210)

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

for t in range(3):
    train_f, train_l = balance_data(train_features, train_labels, t)

    for i in range(len(train_f)):
        for j in range(len(train_f[i])):
            if str(train_f[i][j]) == 'inf':
                train_f[i][j] = INF
    for i in range(len(test_features)):
        for j in range(len(test_features[i])):
            if str(test_features[i][j]) == 'inf':
                test_features[i][j] = INF

    print("Class distribution:", Counter(train_l))
    knn_model = KNeighborsClassifier(n_neighbors=203).fit(train_f, train_l)
    print("\nFor training set\n")
    train_pred = knn_model.predict(train_f)
    get_metrics(t, train_l, train_pred, f"KNN_train_{t}", "KNN")
    print("\nFor test set\n")
    test_pred = knn_model.predict(test_features)
    get_metrics(t, test_labels, test_pred, f"KNN_test_{t}", "KNN")
    print('-'*210)

In [None]:
# SVM
from sklearn import svm

for t in range(3):
    train_f, train_l = balance_data(train_features, train_labels, t)

    for i in range(len(train_f)):
        for j in range(len(train_f[i])):
            if str(train_f[i][j]) == 'inf':
                train_f[i][j] = INF
    for i in range(len(test_features)):
        for j in range(len(test_features[i])):
            if str(test_features[i][j]) == 'inf':
                test_features[i][j] = INF

    print("Class distribution:", Counter(train_l))
    svm_model = svm.SVC().fit(train_f, train_l)
    print("\nFor training set\n")
    train_pred = svm_model.predict(train_f)
    get_metrics(t, train_l, train_pred, f"SVM_train_{t}", "SVM")
    print("\nFor test set\n")
    test_pred = svm_model.predict(test_features)
    get_metrics(t, test_labels, test_pred, f"SVM_test_{t}", "SVM")
    print('-'*210)

# Logging results

In [None]:
!zip -r SVM.zip SVM
!zip -r DT.zip DT
!zip -r RF.zip RF
!zip -r KNN.zip KNN

In [None]:
# from google.colab import files
# files.download("SVM.zip")
# files.download("DT.zip")
# files.download("RF.zip")
# files.download("KNN.zip")