## Imports

In [62]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import tensorflow.keras.backend as K
import re

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Mount google drive

In [63]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preparing data

In [64]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/Hewett's Essay/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1');
df.dropna(axis=1,inplace=True)
df.drop(columns=['domain1_score','rater1_domain1','rater2_domain1'],inplace=True,axis=1)
df.head()

Unnamed: 0,essay_id,essay_set,essay
0,1,1,"Dear local newspaper, I think effects computer..."
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu..."
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl..."
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that..."
4,5,1,"Dear @LOCATION1, I know having computers has a..."


In [65]:
temp = pd.read_csv("/content/drive/MyDrive/Datasets/Hewett's Essay/processed_dataset_2.csv")
temp.drop("Unnamed: 0",inplace=True,axis=1)

In [66]:
df['domain1_score']=temp['final_score']
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",8
4,5,1,"Dear @LOCATION1, I know having computers has a...",6


In [67]:
df['essay'][0]

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

In [68]:
#Make Dataset
y = df['domain1_score']
df.drop('domain1_score',inplace=True,axis=1)

X=df

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [70]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9083, 3)
(9083,)
(3893, 3)
(3893,)


In [71]:
X_train.head()

Unnamed: 0,essay_id,essay_set,essay
12481,21003,8,It's the first day of high school and your gu...
6425,9982,4,The author concludes the story with that parag...
4989,7388,3,Many things that the author talks about in the...
12903,21536,8,"Have you ever had a memorable time, where you..."
10239,16189,6,As a result of the idea to build a mooring mas...


## Data Preprocessing

In [72]:
train_e = X_train['essay'].tolist()
test_e = X_test['essay'].tolist()

train_sents=[]
test_sents=[]

stop_words = set(stopwords.words('english'))

In [73]:
# Converting setence into set of words
def sent2word(x):
    x=re.sub("[^A-Za-z]"," ",x)
    x.lower()
    filtered_sentence = []
    words=x.split()
    for w in words:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

print(sent2word("I am a boy."))

['I', 'boy']


In [None]:
# Converting essay into words
def essay2word(essay):
    essay = essay.strip()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw = tokenizer.tokenize(essay)
    final_words=[]
    for i in raw:
        if(len(i)>0):
            final_words.append(sent2word(i))
    return final_words

print(essay2word("For many years, I have been interested in studying international relations. My interest in pursuing this field stems from several factors which have affected me. First, I have been exposed to international affairs throughout my life. With my father and two of my brothers in the Saudi Foreign Service, I have grown up under the shadow of inter-national affairs. Second, I am fascinated by history, economics, and diplomacy. I believe, through the study of international relations, I can effectively satisfy my curiosity in these fields. A third factor which has affected my interest in international relations is patriotism. Through the Foreign Service, I would not only have the opportunity to serve my country, but also have the chance to help bridge gaps between my country and others. Finally, as a Saudi living abroad, I have been bridging cultures throughout my life. This experience has taught me to look for differences to compromise and similarities to synthesize in order to balance different cultures. In short, I believe that my experiences in life, combined with a rigorous academic education, will enable me to pursue a successful career in the Saudi Foreign Service."))

[['For', 'many', 'years', 'I', 'interested', 'studying', 'international', 'relations'], ['My', 'interest', 'pursuing', 'field', 'stems', 'several', 'factors', 'affected'], ['First', 'I', 'exposed', 'international', 'affairs', 'throughout', 'life'], ['With', 'father', 'two', 'brothers', 'Saudi', 'Foreign', 'Service', 'I', 'grown', 'shadow', 'inter', 'national', 'affairs'], ['Second', 'I', 'fascinated', 'history', 'economics', 'diplomacy'], ['I', 'believe', 'study', 'international', 'relations', 'I', 'effectively', 'satisfy', 'curiosity', 'fields'], ['A', 'third', 'factor', 'affected', 'interest', 'international', 'relations', 'patriotism'], ['Through', 'Foreign', 'Service', 'I', 'would', 'opportunity', 'serve', 'country', 'also', 'chance', 'help', 'bridge', 'gaps', 'country', 'others'], ['Finally', 'Saudi', 'living', 'abroad', 'I', 'bridging', 'cultures', 'throughout', 'life'], ['This', 'experience', 'taught', 'look', 'differences', 'compromise', 'similarities', 'synthesize', 'order', '

In [None]:
for i in train_e:
    train_sents+=essay2word(i)

for i in test_e:
    test_sents+=essay2word(i)

In [None]:
len(train_sents)

116500

## Training a word2vec model

In [None]:
#Training Word2Vec model
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

model = Word2Vec(train_sents,
                 workers=num_workers,
                 vector_size=num_features,
                 min_count = min_word_count,
                 epochs=20,
                 window = context,
                 sample = downsampling)

model.init_sims(replace=True)

  model.init_sims(replace=True)


In [None]:
model.wv.save_word2vec_format("/content/drive/MyDrive/Colab Notebooks/Hewett's Essay/word2vec_model.bin", binary=True)

In [None]:
# Function to make vector from words
def makeVec(words, model, num_features):
    vec = np.zeros((num_features,),dtype="float32")
    noOfWords = 0.
    index2word_set = set(model.wv.index_to_key)
    for i in words:
        if i in index2word_set:
            noOfWords += 1
            vec = np.add(vec,model.wv[i])
    vec = np.divide(vec,noOfWords)
    return vec

vector = makeVec(["I", "am", "a", "boy"], model, 300)
print(vector)
print(vector.shape)

[-4.39859554e-02  4.00146097e-02  1.74685623e-02  4.13536653e-02
 -2.45002229e-02 -4.11658660e-02 -4.56444472e-02  4.77644503e-02
 -4.89481166e-02 -3.75508629e-02  6.27492964e-02 -4.50452790e-02
  2.56170779e-02 -2.06983127e-02  7.67780989e-02  1.82645954e-02
 -3.28961760e-02  1.01339035e-02 -4.74916995e-02 -6.13938197e-02
  5.84007725e-02  3.78347351e-03 -5.41901886e-02 -4.45484966e-02
 -5.19001000e-02  2.78259479e-02  5.95761463e-02  4.76488583e-02
  2.97744237e-02  7.58799352e-03  2.28753202e-02 -6.60892203e-03
  3.60513292e-02 -3.38758491e-02  2.77869161e-02 -3.48846950e-02
  9.84513611e-02 -2.91154776e-02 -4.50489596e-02  3.15144472e-02
  2.27296911e-02  5.68084121e-02  4.61303070e-03  5.01732379e-02
 -6.62066415e-03 -4.97273356e-03  2.25446373e-02  1.08123403e-02
 -6.70910440e-03 -8.58014673e-02 -9.05642211e-02 -1.60601586e-02
  3.47596481e-02 -8.21966119e-03  5.63386939e-02 -2.46221386e-02
 -1.10663474e-04  3.40178721e-02 -6.85928296e-03  7.22627491e-02
  2.92701945e-02  2.04965

In [None]:
# Function to get vectors from an essay
def getVecs(essays, model, num_features):
    c=0
    essay_vecs = np.zeros((len(essays),num_features),dtype="float32")
    for i in essays:
        essay_vecs[c] = makeVec(i, model, num_features)
        c+=1
    return essay_vecs

In [None]:
clean_train=[]
for i in train_e:
    clean_train.append(sent2word(i))

In [None]:
clean_train[0]

['It',
 'first',
 'day',
 'high',
 'school',
 'gut',
 'full',
 'butterflies',
 'make',
 'want',
 'run',
 'bathrooms',
 'hide',
 'world',
 'I',
 'CAPS',
 'friends',
 'yet',
 'I',
 'shy',
 'go',
 'join',
 'groups',
 'I',
 'walked',
 'classes',
 'alone',
 'Students',
 'would',
 'get',
 'face',
 'wanting',
 'know',
 'school',
 'I',
 'came',
 'wanted',
 'know',
 'I',
 'remember',
 'wanting',
 'left',
 'alone',
 'hidden',
 'bubble',
 'space',
 'Girls',
 'would',
 'ask',
 'CAPS',
 'I',
 'upset',
 'since',
 'I',
 'smiling',
 'fun',
 'getting',
 'know',
 'new',
 'classes',
 'Lunch',
 'much',
 'crowded',
 'old',
 'school',
 'Students',
 'would',
 'sit',
 'hallways',
 'eating',
 'lunch',
 'crowd',
 'around',
 'one',
 'big',
 'table',
 'chattering',
 'loudly',
 'one',
 'another',
 'I',
 'chewed',
 'lip',
 'looking',
 'spot',
 'occupied',
 'students',
 'I',
 'found',
 'table',
 'near',
 'front',
 'lunch',
 'room',
 'empty',
 'placed',
 'brown',
 'lunch',
 'sack',
 'front',
 'eating',
 'peanut',
 'b

In [None]:
len(clean_train)

9083

In [None]:
training_vectors = getVecs(clean_train, model, num_features)

In [None]:
training_vectors.shape

(9083, 300)

In [None]:
clean_test=[]

for i in test_e:
    clean_test.append(sent2word(i))
testing_vectors = getVecs(clean_test, model, num_features)

  vec = np.divide(vec,noOfWords)


In [None]:
testing_vectors.shape

(3893, 300)

In [None]:
tv_df = pd.DataFrame(testing_vectors)
tv_df = tv_df.fillna(0) # replacing nan value with 0

In [None]:
tv_df.shape

(3893, 300)

In [None]:
training_vectors = np.array(training_vectors)
testing_vectors = np.array(tv_df)

# Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
training_vectors = np.reshape(training_vectors, (training_vectors.shape[0], 1, training_vectors.shape[1]))
testing_vectors = np.reshape(testing_vectors, (testing_vectors.shape[0], 1, testing_vectors.shape[1]))

print(training_vectors.shape)
print(testing_vectors.shape)

(9083, 1, 300)
(3893, 1, 300)


## Training a LSTM model

In [None]:
def get_model():
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(128, recurrent_dropout=0.4, return_sequences = True))
    model.add(Dropout(0.3))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae', 'mse'])
    model.summary()
    return model

In [None]:
lstm_model = get_model()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 300)            721200    
                                                                 
 dropout (Dropout)           (None, 1, 300)            0         
                                                                 
 lstm_1 (LSTM)               (None, 1, 128)            219648    
                                                                 
 dropout_1 (Dropout)         (None, 1, 128)            0         
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 6

In [None]:
history = lstm_model.fit(training_vectors, y_train, batch_size=64, epochs=150, validation_data=(testing_vectors, y_test))

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [None]:
lstm_model.save("/content/drive/MyDrive/Colab Notebooks/Hewett's Essay/simple_lstm.keras")
y_pred = lstm_model.predict(testing_vectors)
y_pred = np.around(y_pred)
y_pred



array([[2.],
       [5.],
       [7.],
       ...,
       [7.],
       [8.],
       [9.]], dtype=float32)

In [93]:
lstm_model.save_weights("/content/drive/MyDrive/Colab Notebooks/Hewett's Essay/simple_lstm.h5")

## Model Evaluation

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error

In [None]:
y_pred = np.array(y_pred)
y_pred[np.isnan(y_pred)] = 0

In [None]:
# Check for NaN values in y_test
if y_test.isnull().values.any():
    # Handle NaN values in y_test
    y_test = y_test.fillna(0)  # Replace NaN values with 0 (or any other value)

In [None]:
assert len(y_test) == len(y_pred), "Mismatched lengths between y_test and y_pred"

In [None]:
result = cohen_kappa_score(y_test.values,y_pred, weights='quadratic')
print("Kappa Score: {}".format(result))

Kappa Score: 0.6450452385623828


In [None]:
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))

Mean squared error: 3.39


## Hyperparameters Optimization

In [None]:
!pip install keras-tuner



In [None]:
import keras_tuner
from keras_tuner.tuners import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters

In [84]:
def build_model(hp):
    model = Sequential()
    model.add(LSTM(hp.Int('input_unit', min_value=32, max_value=512, step=32), input_shape=[1, 300], return_sequences=True))
    model.add(Dropout(hp.Float('Dropout_rate', min_value=0, max_value=0.5, step=0.1)))
    for i in range(hp.Int('n_layers', 1, 4)):
        model.add(LSTM(hp.Int(f'lstm_{i}_units', min_value=32, max_value=512, step=32), return_sequences=True))
        model.add(Dropout(hp.Float('Dropout_rate', min_value=0, max_value=0.5, step=0.1)))
    model.add(LSTM(hp.Int('layer_2_neurons', min_value=32, max_value=512, step=32)))
    model.add(Dropout(hp.Float('Dropout_rate', min_value=0, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics = ["mse"])
    return model

In [85]:
tuner= RandomSearch(
        build_model,
        objective='val_mse',
        max_trials=5,
        executions_per_trial=1
        )

In [86]:
tuner.search(
        x=training_vectors,
        y=y_train,
        epochs=150,
        batch_size=128,
        validation_data=(testing_vectors,y_test),
)

Trial 5 Complete [00h 02m 29s]
val_mse: 3.2900352478027344

Best val_mse So Far: 3.2599399089813232
Total elapsed time: 00h 17m 28s


## Getting the best model
After the search has been done (it may take a long time). We are ready to get the best model.

In [87]:
best_model = tuner.get_best_models(num_models=1)[0]

Testing the best model

In [88]:
y_pred_1 = best_model.predict(testing_vectors)
y_pred_1 = np.around(y_pred_1)



In [89]:
result = cohen_kappa_score(y_test.values, y_pred_1, weights='quadratic')
print("Kappa Score: {}".format(result))

Kappa Score: 0.6718185214975547


In [90]:
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred_1))

Mean squared error: 3.36


In [92]:
best_model.save("/content/drive/MyDrive/Colab Notebooks/Hewett's Essay/best_lstm.keras")