In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jayasree\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jayasree\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split



In [3]:
data= pd.read_csv('./data1/training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')

In [4]:
data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [5]:
data.isnull().any()

essay_id          False
essay_set         False
essay             False
rater1_domain1    False
rater2_domain1    False
rater3_domain1     True
domain1_score     False
rater1_domain2     True
rater2_domain2     True
domain2_score      True
rater1_trait1      True
rater1_trait2      True
rater1_trait3      True
rater1_trait4      True
rater1_trait5      True
rater1_trait6      True
rater2_trait1      True
rater2_trait2      True
rater2_trait3      True
rater2_trait4      True
rater2_trait5      True
rater2_trait6      True
rater3_trait1      True
rater3_trait2      True
rater3_trait3      True
rater3_trait4      True
rater3_trait5      True
rater3_trait6      True
dtype: bool

In [6]:
data= data.dropna(axis=1)
data= data.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [7]:
data.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [8]:
x=data.iloc[:,0:3]
y=data.iloc[:,3]

In [9]:
x

Unnamed: 0,essay_id,essay_set,essay
0,1,1,"Dear local newspaper, I think effects computer..."
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu..."
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl..."
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that..."
4,5,1,"Dear @LOCATION1, I know having computers has a..."
5,6,1,"Dear @LOCATION1, I think that computers have a..."
6,7,1,Did you know that more and more people these d...
7,8,1,@PERCENT1 of people agree that computers make ...
8,9,1,"Dear reader, @ORGANIZATION1 has had a dramatic..."
9,10,1,In the @LOCATION1 we have the technology of a ...


In [10]:
y

0         8
1         9
2         7
3        10
4         8
5         8
6        10
7        10
8         9
9         9
10        8
11        8
12        7
13        6
14        6
15       12
16        8
17        8
18        4
19        6
20        8
21        3
22       10
23       11
24        8
25        9
26        4
27        9
28        9
29        8
         ..
12946    40
12947    32
12948    36
12949    31
12950    30
12951    47
12952    40
12953    35
12954    33
12955    36
12956    36
12957    48
12958    40
12959    40
12960    40
12961    42
12962    40
12963    32
12964    36
12965    40
12966    10
12967    33
12968    44
12969    35
12970    30
12971    35
12972    32
12973    40
12974    40
12975    40
Name: domain1_score, Length: 12976, dtype: int64

In [11]:
min_scores = [-1, 2, 1, 0, 0, 0, 0, 0, 0]
max_scores = [-1, 12, 6, 3, 3, 4, 4, 30, 60]

# preprocessing

In [12]:
#removing the extra characters other than alphabets and stopwords and tokenizing the words
def essay_to_wordlist(essay_v):
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    stops = set(stopwords.words("english"))
    words = [w for w in words if not w in stops]
    return (words)

#Tokenize the senstences and call essay_to_wordlist() for word tokenization.
def essay_to_sentences(essay_v):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(essay_to_wordlist(raw_sentence))
    return sentences


In [60]:
#Feature vector is made from the words list of an essay.
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index_to_key)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec,model.wv[word])        
    featureVec = np.divide(featureVec,num_words)
    return featureVec

#Word vectors are generated for Word2Vec model
def getAvgFeatureVecs(essays, model, num_features):
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

In [61]:
#the dataset is split to training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 0)

In [62]:
x_train.shape

(10380, 3)

In [63]:
x_test.shape

(2596, 3)

In [64]:
train_essays = x_train['essay']
test_essays = x_test['essay']

In [65]:
sentences = []
# Obtaining all sentences from the training essays.
for essay in train_essays:
    sentences += essay_to_sentences(essay)

In [66]:
sentences

[['essay', 'author', 'talking', 'journey'],
 ['shows',
  'give',
  'ends',
  'almost',
  'way',
  'began',
  'shows',
  'people',
  'stay',
  'attached',
  'past',
  'reasons',
  'believe',
  'author',
  'concludes',
  'story',
  'paragraph'],
 ['shows',
  'people',
  'give',
  'even',
  'though',
  'failed',
  'test',
  'try',
  'later',
  'pass'],
 ['also',
  'story',
  'ends',
  'way',
  'begins',
  'taking',
  'test',
  'getting',
  'ready',
  'good',
  'way',
  'writing'],
 ['one',
  'last',
  'thing',
  'shows',
  'people',
  'become',
  'attached',
  'past',
  'look',
  'one',
  'thing',
  'reminds',
  'month',
  'fall',
  'apart',
  'takes',
  'test',
  'see',
  'look',
  'familiar',
  'flowers',
  'without',
  'falling',
  'apart'],
 ['story',
  'winter',
  'hibiscus',
  'minfong',
  'ho',
  'think',
  'ending',
  'paragraph',
  'show',
  'saeng',
  'given'],
 ['says', 'come', 'back', 'take', 'test', 'talking', 'hibiscus', 'geese'],
 ['things', 'remind', 'home'],
 ['reminders'

In [67]:
#build the vectorizer with maximum featuroes of 300
num_features = 300 
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3
model = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, 
                 sample = downsampling)             

In [68]:
#save the vectorizer in .bin file
model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

In [69]:
#get the training vectors
clean_train_essays = []
for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v))
trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
 
#get the testing vectors
clean_test_essays = []
for essay_v in test_essays:
    clean_test_essays.append(essay_to_wordlist( essay_v))
testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
    
#convert the vectors to numpy array
trainDataVecs = np.array(trainDataVecs)
testDataVecs = np.array(testDataVecs)

# Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

# model building

In [70]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [71]:
model = Sequential()
model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
model.add(LSTM(64, recurrent_dropout=0.4))
model.add(Dropout(0.5))
model.add(Dense(1, activation='relu'))





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [72]:
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])




In [None]:
model.fit(trainDataVecs, y_train, batch_size=64, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [102]:
testDataVecs.shape

(2596, 1, 300)

In [103]:
y_pred = model.predict(testDataVecs)

In [104]:
y_pred

array([[11.495794 ],
       [17.46849  ],
       [ 2.9602134],
       ...,
       [ 2.002942 ],
       [ 2.7216907],
       [ 2.8064475]], dtype=float32)

In [105]:
model.save('final_lstm.h5')

In [106]:
from sklearn.metrics import r2_score
accuracy = r2_score(y_test,y_pred)
accuracy

0.9408386107051878