# Importing the data

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Directory that holds the essay TSV read by the loading cell.
DATASET_DIR = './dataset/'
# Output directory constant (presumably for saved artifacts; the visible
# cells write their model files with literal paths instead).
SAVE_DIR = "./"

In [3]:
# Read the tab-separated essay table (Latin-1 encoded).
essays_path = os.path.join(DATASET_DIR, 'training_set_rel3.tsv')
X = pd.read_csv(essays_path, sep='\t', encoding='ISO-8859-1')

# Keep the raw domain-1 score as the initial target.
y = X['domain1_score']

# Discard every column that contains a missing value, then the two
# per-rater score columns.
X = X.dropna(axis=1).drop(columns=['rater1_domain1', 'rater2_domain1'])

X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [4]:
from sklearn.preprocessing import StandardScaler

# Standardise the raw score column first.
# NOTE(review): the min-max rescale below is invariant to this affine step
# (up to floating-point rounding), so the standardisation looks redundant.
scaler = StandardScaler()
standardized = scaler.fit_transform(X['domain1_score'].values.reshape(-1, 1))
X['score'] = standardized

# Stretch the column onto [new_min, new_max].
new_min, new_max = 0, 100
score_low = X['score'].min()
score_span = X['score'].max() - score_low
X['score'] = (X['score'] - score_low) / score_span * (new_max - new_min) + new_min

# The training target is the rescaled score rounded to a whole number.
y = np.round(X['score'])

X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,score
0,1,1,"Dear local newspaper, I think effects computer...",8,13.333333
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,15.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,11.666667
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,16.666667
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,13.333333


In [5]:
# Display the target series (the rounded, rescaled 'score' column).
y

0        13.0
1        15.0
2        12.0
3        17.0
4        13.0
         ... 
12971    58.0
12972    53.0
12973    67.0
12974    67.0
12975    67.0
Name: score, Length: 12976, dtype: float64

# Preprocessing the data

In [6]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

def _english_stopwords():
    """Return the NLTK English stop-word set, building it only on the first call."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def essay_to_wordlist(essay_v, remove_stopwords):
    """Lower-case a text and split it into purely alphabetic word tokens.

    Every character that is not an ASCII letter is replaced by a space before
    splitting, so digits and punctuation disappear and a tag such as
    "@CAPS1" is reduced to its letters ("caps").

    Args:
        essay_v: The essay (or sentence) text.
        remove_stopwords: When true, drop NLTK English stop words.

    Returns:
        A list of lower-case word strings (possibly empty).
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    words = letters_only.lower().split()
    if remove_stopwords:
        # The stop-word set is cached: this function runs once per essay and
        # once per sentence, and rebuilding the set each time is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and tokenise each one.

    The stripped essay is cut into sentences by the NLTK punkt English
    tokenizer; every non-empty sentence is passed to essay_to_wordlist().

    Returns:
        A list with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(chunk, remove_stopwords)
        for chunk in punkt.tokenize(essay_v.strip())
        if len(chunk) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the embedding vectors of the words of one essay.

    Args:
        words: Iterable of word tokens.
        model: Mapping from word to vector; it must support ``word in model``
            and ``model[word]``, and each vector must have ``num_features``
            entries.
        num_features: Length of the returned vector.

    Returns:
        A float32 array of length ``num_features`` holding the mean of the
        vectors of all words found in ``model``. When no word is found
        (empty essay or only out-of-vocabulary tokens) the zero vector is
        returned instead of dividing by zero, which would yield NaNs.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0
    for word in words:
        if word in model:
            num_words += 1
            featureVec = np.add(featureVec, model[word])
    if num_words == 0:
        # Nothing to average: keep the all-zero vector rather than 0/0 -> NaN.
        return featureVec
    return np.divide(featureVec, float(num_words))

def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors, one row per essay.

    Args:
        essays: Sequence of word lists (one per essay).
        model: Word-to-vector lookup forwarded to makeFeatureVec().
        num_features: Width of each row.

    Returns:
        A float32 array of shape (len(essays), num_features).
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay_words in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay_words, model, num_features)
    return essayFeatureVecs

# Defining the Model

In [41]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_model(num_features=200):
    """Build, compile and summarise the two-layer LSTM score regressor.

    Args:
        num_features: Width of each input vector. It sets both the expected
            input shape ``[1, num_features]`` (one timestep) and the number of
            units of the first LSTM layer. The default of 200 reproduces the
            previously hard-coded configuration.

    Returns:
        A compiled Keras ``Sequential`` model: LSTM(num_features) -> LSTM(64)
        -> Dropout(0.5) -> Dense(1, relu), trained with mean squared error and
        RMSprop, reporting mean absolute error.
    """
    model = Sequential()
    model.add(LSTM(num_features, dropout=0.4, recurrent_dropout=0.4,
                   input_shape=[1, num_features], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    # Single non-negative output: the predicted essay score.
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

In [None]:
def load_glove_embeddings(path):
    """Read a GloVe text file into a dict.

    Each line is split on whitespace; the first field is the token and the
    remaining fields are parsed as its float32 vector.

    Args:
        path: Path of the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        dict mapping token (str) to a float32 numpy array.
    """
    embeddings = {}
    with open(path, 'r', encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            embeddings[values[0]] = np.asarray(values[1:], 'float32')
    return embeddings


# Tokenised, stop-word-filtered version of every essay.
corpus = [essay_to_wordlist(essay, True) for essay in X['essay']]

# 200-dimensional vectors are loaded here. For the other widths pass
# './glove.6B/glove.6B.100d.txt' or './glove.6B/glove.6B.50d.txt' instead.
embedding_dict = load_glove_embeddings('./glove.6B/glove.6B.200d.txt')

# Using 50d GloVe embedding

In [22]:
from sklearn.model_selection import train_test_split

# Width of the word vectors used in this experiment.
num_features = 50

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_essays = X_train['essay']
test_essays = X_test['essay']

# Sentence-level token lists of the training essays
# (no visible cell of this notebook reads `sentences` afterwards).
sentences = []
for essay in train_essays:
    sentences += essay_to_sentences(essay, remove_stopwords=True)

# Load the GloVe file whose width matches num_features. Reusing the shared
# `embedding_dict` here fails on a fresh run, because it holds 200-dimensional
# vectors that cannot be added to a 50-dimensional accumulator.
model = {}
with open('./glove.6B/glove.6B.{}d.txt'.format(num_features), 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        model[values[0]] = np.asarray(values[1:], 'float32')

# One averaged word vector per essay.
clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays]
trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]
testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

# Reshape train and test vectors to 3 dimensions (1 represents one timestep)
trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

In [27]:
# Build a fresh network and train it on the averaged essay vectors.
# NOTE(review): get_model() as written in this notebook fixes the input shape
# at [1, 200], while the summary recorded below shows a 50-unit first layer --
# presumably the definition was edited before this run. Confirm that the model
# width matches num_features before re-running.
lstm_model = get_model()
lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=150)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 1, 50)             20200     
                                                                 
 lstm_5 (LSTM)               (None, 64)                29440     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 49,705
Trainable params: 49,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/

<keras.callbacks.History at 0x25e6c3b2ce0>

In [24]:
from sklearn.metrics import cohen_kappa_score

# Persist the trained network.
lstm_model.save('./final_lstm_model50.h5')

# Predictions are continuous; round them to whole scores before comparing
# with the held-out labels.
y_pred = np.round(lstm_model.predict(testDataVecs))

# Quadratic-weighted Cohen's kappa between labels and rounded predictions.
result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
print("Kappa Score: {}".format(result))

Kappa Score: 0.9236359418268769


# Using 100d GloVe embedding

In [66]:
from sklearn.model_selection import train_test_split

# Width of the word vectors used in this experiment.
num_features = 100

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_essays = X_train['essay']
test_essays = X_test['essay']

# Sentence-level token lists of the training essays
# (no visible cell of this notebook reads `sentences` afterwards).
sentences = []
for essay in train_essays:
    sentences += essay_to_sentences(essay, remove_stopwords=True)

# Load the GloVe file whose width matches num_features. Reusing the shared
# `embedding_dict` here fails on a fresh run, because it holds 200-dimensional
# vectors that cannot be added to a 100-dimensional accumulator.
model = {}
with open('./glove.6B/glove.6B.{}d.txt'.format(num_features), 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        model[values[0]] = np.asarray(values[1:], 'float32')

# One averaged word vector per essay.
clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays]
trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]
testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

# Reshape train and test vectors to 3 dimensions (1 represents one timestep)
trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

In [34]:
# Build a fresh network and train it on the averaged essay vectors.
# NOTE(review): get_model() as written in this notebook fixes the input shape
# at [1, 200], while the summary recorded below shows a 100-unit first layer --
# presumably the definition was edited before this run. Confirm that the model
# width matches num_features before re-running.
lstm_model = get_model()
lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=150)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 1, 100)            80400     
                                                                 
 lstm_7 (LSTM)               (None, 64)                42240     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 122,705
Trainable params: 122,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 1

<keras.callbacks.History at 0x25e6c367340>

In [35]:
from sklearn.metrics import cohen_kappa_score

# Persist the trained network.
lstm_model.save('./final_lstm_model100.h5')

# Predictions are continuous; round them to whole scores before comparing
# with the held-out labels.
y_pred = np.round(lstm_model.predict(testDataVecs))

# Quadratic-weighted Cohen's kappa between labels and rounded predictions.
result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
print("Kappa Score: {}".format(result))

Kappa Score: 0.9575496052479834


# Using 200d GloVe embedding

In [52]:
from sklearn.model_selection import train_test_split

# Same 80/20 split (fixed seed) as used by the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_essays = X_train['essay']
test_essays = X_test['essay']

# Sentence-level token lists of the training essays.
sentences = []
for essay in train_essays:
    sentences += essay_to_sentences(essay, remove_stopwords=True)

# Vector width and the word -> vector lookup used for averaging.
num_features = 200
model = embedding_dict

# One averaged word vector per essay.
clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays]
trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]
testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

# Insert a length-1 timestep axis: (samples, features) -> (samples, 1, features).
trainDataVecs = np.expand_dims(trainDataVecs, axis=1)
testDataVecs = np.expand_dims(testDataVecs, axis=1)

In [44]:
# Build a fresh network (get_model() expects 200-dimensional inputs, matching
# num_features = 200 here) and train it on the averaged essay vectors.
lstm_model = get_model()
lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=150)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, 1, 200)            320800    
                                                                 
 lstm_9 (LSTM)               (None, 64)                67840     
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 388,705
Trainable params: 388,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 1

<keras.callbacks.History at 0x25e6d959570>

In [46]:
from sklearn.metrics import cohen_kappa_score

# Persist the trained network.
lstm_model.save('./final_lstm_model200.h5')

# Predictions are continuous; round them to whole scores before comparing
# with the held-out labels.
y_pred = np.round(lstm_model.predict(testDataVecs))

# Quadratic-weighted Cohen's kappa between labels and rounded predictions.
result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
print("Kappa Score: {}".format(result))

Kappa Score: 0.9629401012634522
