In [15]:
import nltk
import statistics
import numpy as np
import pickle as pkl
import autocorrect
from spellchecker import SpellChecker
from sklearn.metrics import cohen_kappa_score
import pandas as pd

In [17]:
DATASET_DIR = './data/'

In [21]:
X = pd.read_csv((DATASET_DIR + 'training_set_rel3.tsv'), sep='\t', encoding='ISO-8859-1')
y = X['domain1_score']
X = X.dropna(axis=1)
X = X.drop(columns=['essay_set','rater1_domain1', 'rater2_domain1'])

In [120]:
X1 = pd.read_csv((DATASET_DIR + 'training_set_rel3.tsv'), sep='\t', encoding='ISO-8859-1')
X1 = X1.dropna(axis=1)
X1 = X1.drop(columns=['rater1_domain1', 'rater2_domain1'])
X1 = X1[X1['essay_set'] == 1]
y1 = X1['domain1_score']
X1.shape

(1783, 4)

In [2]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r',encoding="utf8")
    model = {}
    for line in f:
        try:
            splitLine = line.split()
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
        except:
            print(word)
    print("Done.",len(model)," words loaded!")
    return model

In [3]:
glove_embeddings = loadGloveModel('glove.840B.300d.txt')

Loading Glove Model
.
at
.
to
.
.
email
or
contact
Email
on
At
by
in
emailing
Contact
at
•
at
is
Done. 2195884  words loaded!


In [41]:
glove_embeddings['']

array([ 3.5233e-01,  2.8119e-01,  1.4315e-01,  3.9110e-01, -1.2352e-01,
       -3.0786e-02,  6.3167e-01, -3.5628e-01,  3.6686e-01, -1.7126e+00,
        6.6194e-01, -5.4744e-01, -5.0363e-01, -6.2587e-02,  1.1633e-01,
        5.0001e-03, -4.5603e-02, -8.6685e-01,  4.2480e-01,  3.0265e-01,
       -1.4334e-01, -3.5555e-01,  5.8301e-01,  6.7003e-01,  5.7672e-01,
       -5.3449e-02, -9.4340e-01,  2.6500e-01, -5.4139e-01,  5.3269e-01,
       -4.1811e-01, -3.4529e-01,  1.3248e-01, -6.1739e-01, -9.9286e-01,
        2.9506e-01,  4.9432e-01,  5.1843e-01, -2.8002e-01, -5.7088e-03,
       -7.1925e-02, -5.9534e-01, -2.1579e-01, -3.4581e-01,  4.1651e-03,
        2.3076e-01,  4.4959e-01, -5.0750e-01,  1.0346e+00,  3.0056e-01,
       -2.3742e-02,  1.0007e-01,  7.3591e-01,  6.0800e-01,  6.6588e-01,
       -4.0153e-01, -1.8940e-01,  3.1168e-01, -6.8257e-01,  4.2936e-01,
       -1.1959e+00, -7.0066e-01, -1.4447e-01, -4.3619e-01, -8.7539e-02,
        3.4957e-01, -1.2967e-01,  4.0485e-01, -4.9370e-01,  8.01

In [None]:
VOCAB = []
SENTENCES = []
TOKENS = []

In [None]:
def word2vec(sent, stop_words=False):
    w2v = load_w2v()
    w2v_mean = np.mean(list(w2v.values()))
    w2v_std = np.std(list(w2v.values()))
    embed_size = 300

    embeddings = np.random.normal(w2v_mean, w2v_std, (len(sent), embed_size))
    for i in range(0, len(sent)):
        tokens = tokenize_words(sent[i], remove_stop_words=stop_words)[0]
        count = 0
        for token in tokens:
            if token in w2v:
                count += 1
                embeddings[i] += w2v[token]
        embeddings[i] /= count
    return embeddings

In [102]:
# returns list of misspelled words, corrected errors and the total number of misspelled words
def typos(words):
    spell = SpellChecker()

    # find those words that may be misspelled
    misspelled = spell.unknown(words)
    corrected_words = {}
    for word in misspelled:
        # Get the one `most likely` answer
        correct_spell = spell.correction(word)
        if correct_spell != word:
            corrected_words[word] = correct_spell

    misspelled = [word for word in corrected_words]
    corrected_words = [corrected_words[word] for word in corrected_words]
    return misspelled, corrected_words, len(misspelled)
def spelling_errors(data):
    words = tokenize_words(data)[0]
#     print(words)
    #0.7 because a lot of words are correct but marked as spelling errors
    return int(typos(words)[2]*0.7)

In [57]:
# returns the list of stop words
def get_stop_words():
    stop_words = nltk.corpus.stopwords.words('english')
    return stop_words

In [70]:
# returns all sentences and total number of sentences
def tokenize_sentences(data):
    sent_token = nltk.tokenize.sent_tokenize(data)
    return sent_token, len(sent_token)
def sent_count(data):
    return tokenize_sentences(data)[1]

In [68]:
# returns all tokens, all types and total number of tokens
# if punc is False(default True), then punctuations are not removed
# If lower is False(default True), then words are not kept as is, not converted to lower case
def tokenize_words(data, punc=True, remove_stop_words=False, lower=True):
    if lower:
        word_tokens = nltk.tokenize.word_tokenize(data.lower())
    else:
        word_tokens = nltk.tokenize.word_tokenize(data)
    if punc:
        word_tokens = [word for word in word_tokens if word.isalnum()]
    if remove_stop_words:
        stop_words = get_stop_words()
        word_tokens = [word for word in word_tokens if word not in stop_words]
        
    return word_tokens, list(set(word_tokens)), len(word_tokens)

def word_count(data):
    return tokenize_words(data)[2]

In [None]:
# returns frequency of each word
def token_frequency(words):
    frequency_of_tokens = nltk.FreqDist(words)
    return frequency_of_tokens

In [85]:
# returns length of each word in token
def token_length(data):
    words = tokenize_words(data)[0]
    len_of_tokens = {}
    for word in words:
        len_of_tokens[word] = len(word)
    return len_of_tokens
def avg_length(words):
    return statistics.mean(token_length(words).values())

In [None]:
# returns length of sentences in terms of number of words and the average length of sentences
def sentence_length(sent):
    len_of_sent = []
    for s in sent:
        len_of_sent.append(tokenize_words(s)[1])
    return len_of_sent, statistics.mean(len_of_sent)

In [None]:
def syntactic_correctness(sent):
    print(sent)
    rd_parser = nltk.RecursiveDescentParser(sent)
    print(rd_parser)
    for tree in rd_parser:
        print(tree)

In [51]:
def char_count(data):
    return len(data.lower().replace(' ',''))

In [90]:
def pos_tags(data):
    sent = tokenize_sentences(data)[0]
    
    noun_count = 0
    adj_count = 0
    verb_count = 0
    adv_count = 0
    
    for s in sent:
        tags = nltk.pos_tag(tokenize_words(s)[0])
#         print(tags)
        for tag in tags:
            if tag[1][0] == 'N':
                noun_count += 1
            elif tag[1][0] == 'J':
                adj_count += 1
            elif tag[1][0] == 'V':
                verb_count += 1
            elif tag[1][0] == 'R':
                adv_count += 1
    return noun_count,adj_count,verb_count,adv_count

In [None]:
# For this project, only essay set 1 was used for analysis and model creation.

# Features:
# 1. Bag of Words (BOW) counts (10000 words with maximum frequency)

In [312]:
# 2. Number of characters in an essay
# X['num_chars'] = X['essay'].apply(char_count)
X1['num_chars'] = X1['essay'].apply(char_count)
# X.head()
X1.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,num_chars,num_words,num_sents,avg_word_length,noun_count,adj_count,verb_count,adv_count
0,1,1,"Dear local newspaper, I think effects computer...",8,1538,-0.277644,-0.746351,-0.007471,-0.532164,-0.708112,-0.204692,0.065043
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,1870,0.453147,-0.305963,-0.027029,0.394898,-0.263516,0.432873,-0.589108
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,1263,-0.714439,-0.966546,-0.131474,-0.377654,-0.441354,-0.842257,-0.869458
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,2642,1.309936,0.464717,2.160992,2.249023,1.24811,0.911046,0.812644
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,2105,0.822742,0.795009,0.333307,0.456703,0.358918,0.592264,1.092995


In [313]:
# 3. Number of words in an essay
# X['num_words'] = X['essay'].apply(word_count)
# X.head()
X1['num_words'] = X1['essay'].apply(word_count)
X1.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,num_chars,num_words,num_sents,avg_word_length,noun_count,adj_count,verb_count,adv_count
0,1,1,"Dear local newspaper, I think effects computer...",8,1538,331,-0.746351,-0.007471,-0.532164,-0.708112,-0.204692,0.065043
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,1870,418,-0.305963,-0.027029,0.394898,-0.263516,0.432873,-0.589108
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,1263,279,-0.966546,-0.131474,-0.377654,-0.441354,-0.842257,-0.869458
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,2642,520,0.464717,2.160992,2.249023,1.24811,0.911046,0.812644
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,2105,462,0.795009,0.333307,0.456703,0.358918,0.592264,1.092995


In [314]:
# 4. Number of sentences in an essay
# X['num_sents'] = X['essay'].apply(sent_count)
# print(X.head())
X1['num_sents'] = X1['essay'].apply(sent_count)
X1.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,num_chars,num_words,num_sents,avg_word_length,noun_count,adj_count,verb_count,adv_count
0,1,1,"Dear local newspaper, I think effects computer...",8,1538,331,16,-0.007471,-0.532164,-0.708112,-0.204692,0.065043
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,1870,418,20,-0.027029,0.394898,-0.263516,0.432873,-0.589108
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,1263,279,14,-0.131474,-0.377654,-0.441354,-0.842257,-0.869458
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,2642,520,27,2.160992,2.249023,1.24811,0.911046,0.812644
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,2105,462,30,0.333307,0.456703,0.358918,0.592264,1.092995


In [315]:
# 5. Average word length of an essay
# X['avg_word_length'] = X['essay'].apply(avg_length)
# print(X.head())
X1['avg_word_length'] = X1['essay'].apply(avg_length)
X1.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,num_chars,num_words,num_sents,avg_word_length,noun_count,adj_count,verb_count,adv_count
0,1,1,"Dear local newspaper, I think effects computer...",8,1538,331,16,5.006369,-0.532164,-0.708112,-0.204692,0.065043
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,1870,418,20,5.0,0.394898,-0.263516,0.432873,-0.589108
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,1263,279,14,4.965986,-0.377654,-0.441354,-0.842257,-0.869458
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,2642,520,27,5.712551,2.249023,1.24811,0.911046,0.812644
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,2105,462,30,5.117347,0.456703,0.358918,0.592264,1.092995


In [None]:
# 6. Number of lemmas in an essay

In [125]:
# 7. Number of spelling errors in an essay
# X['spelling_erros'] = X['essay'].apply(spelling_errors)
# X.head()
X1['spelling_erros'] = X1['essay'].apply(spelling_errors)
X1.head()

KeyboardInterrupt: 

In [316]:
# 8. Number of nouns in an essay
# 9. Number of adjectives in an essay
# 10. Number of verbs in an essay
# 11. Number of adverbs in an essay
# X['noun_count'], X['adj_count'], X['verb_count'], X['adv_count'] = zip(*X['essay'].map(pos_tags))
# X.head()
X1['noun_count'], X1['adj_count'], X1['verb_count'], X1['adv_count'] = zip(*X1['essay'].map(pos_tags))
X1.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,num_chars,num_words,num_sents,avg_word_length,noun_count,adj_count,verb_count,adv_count
0,1,1,"Dear local newspaper, I think effects computer...",8,1538,331,16,5.006369,74,19,69,24
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,1870,418,20,5.0,104,24,85,17
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,1263,279,14,4.965986,79,22,53,14
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,2642,520,27,5.712551,164,41,97,32
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,2105,462,30,5.117347,106,31,89,35


In [127]:
X1.head

<bound method NDFrame.head of       essay_id  essay_set                                              essay  \
0            1          1  Dear local newspaper, I think effects computer...   
1            2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2            3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3            4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4            5          1  Dear @LOCATION1, I know having computers has a...   
...        ...        ...                                                ...   
1778      1783          1  Dear @CAPS1, @CAPS2 several reasons on way I t...   
1779      1784          1  Do a adults and kids spend to much time on the...   
1780      1785          1  My opinion is that people should have computer...   
1781      1786          1  Dear readers, I think that its good and bad to...   
1782      1787          1  Dear - Local Newspaper I agree thats computers...   

      dom

In [195]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_model(max_len):
    """Define the model."""
#     model = Sequential()
#     model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=300, return_sequences=True))
#     model.add(LSTM(64, recurrent_dropout=0.4))
#     model.add(Dropout(0.5))
#     model.add(Dense(1, activation='relu'))

#     model = tf.keras.Sequential()
#     # Add an Embedding layer expecting input vocab of size 1000, and
#     # output embedding dimension of size 64.
#     model.add(Embedding(input_dim=300, output_dim=64,trainable=False))

#     # Add a LSTM layer with 128 internal units.
#     model.add(LSTM(128))

#     # Add a Dense layer with 10 units and softmax activation.
#     model.add(Dense(1, activation='relu'))

#     model.summary()
#     model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])



    # Input layer, which takes in as input a vector of size maxlen (150)
    # this is the sequence of token ranks we have previously seen
    inp = tf.keras.layers.Input(shape=(max_len,))

    # This is the embedding layer that maps input token rank sequence to a sequence of vectors
    # We can also fine-tune the embeddings by changing the trainable parameter to True. However,
    # That means we need to compute an additional 3 million parameters and it's best to set it to False
    # in this case for faster training
    x = tf.keras.layers.Embedding(300, 300, trainable=False)(inp)

    # This is the main LSTM layer with 64 units
    x = tf.keras.layers.LSTM(64)(x)

    # A dense layer, just like that in neural networks with ReLU activation
#     x = tf.keras.layers.Dense(16, activation="relu")(x)

    # A dense layer with a single unit with sigmoid activation
    # The sigmoid function ensures that the output is between 0 and 1 (proability)
    x = tf.keras.layers.Dense(1, activation="relu")(x)

    # Set up the model with input and output layers
    model = tf.keras.Model(inputs=inp, outputs=x)

    # Compile the model for faster computation and specify the loss and optimizer
    # We can also specify a list of metrics we want to monitor, here we only care about accuracy
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])

    # Model summary provides the list of layers, the output shape of each layer and the number of parameters
    # Total params is the number of model parameters
    # Trainable params are the parameters changed during training
    # Non-trainable params are model parameters which are not updated during training
    model.summary()
    return model

In [275]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_model_nn():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(128, input_dim=300, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(64, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(24, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(1, activation='relu'))
    print(model.summary())
    model.compile(optimizer='rmsprop', loss='mean_squared_error', metrics=['accuracy'])
    return model

In [242]:
X1_train = X1.loc[:int(len(X1.index)*0.8)]
y1_train = X1_train['domain1_score']
X1_val = X1.loc[int(len(X1.index)*0.8)+1:int(len(X1.index)*0.9)]
y1_val = X1_val['domain1_score']
X1_test = X1.loc[int(len(X1.index)*0.9)+1:]
y1_test = X1_test['domain1_score']

In [241]:
print(X1.shape)
print(X1_train.shape)
print(X1_val.shape)
print(X1_test.shape)

(1783, 8)
(1427, 8)
(178, 8)
(178, 8)


In [157]:
embedding_matrix = np.zeros((len(X1.index), 300))
print(embedding_matrix.shape)
for i in range(0, len(X1.index)):
    tokens = tokenize_words(X1.loc[i]['essay'])
    count = 0
    for token in tokens[1]:
        try:
            embedding_matrix[i] = np.add(embedding_matrix[i], glove_embeddings[token])
            count += 1
        except KeyError:
            pass
    embedding_matrix[i] = np.divide(embedding_matrix[i], count)

(1783, 300)


In [243]:
embedding_matrix_train = embedding_matrix[:len(X1_train.index)]
embedding_matrix_val = embedding_matrix[len(X1_train.index):len(X1_train.index) + len(X1_val.index)]
embedding_matrix_test = embedding_matrix[len(X1_train.index) + len(X1_val.index):]

In [244]:
print(embedding_matrix_test.shape)
print(embedding_matrix_train.shape)
print(embedding_matrix_val.shape)

(178, 300)
(1427, 300)
(178, 300)


In [293]:
lstm_model = get_model_nn()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_62 (Dense)             (None, 128)               38528     
_________________________________________________________________
dense_63 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_64 (Dense)             (None, 24)                1560      
_________________________________________________________________
dense_65 (Dense)             (None, 1)                 25        
Total params: 48,369
Trainable params: 48,369
Non-trainable params: 0
_________________________________________________________________
None


In [294]:
print()
print(embedding_matrix_train.shape)
# print(type(y1_train))
y1_train = np.asarray(y1_train)
y1_val = np.asarray(y1_val)
lstm_model.fit(embedding_matrix_train, y1_train, epochs=500,validation_data=(embedding_matrix_val,y1_val))


(1427, 300)
Train on 1427 samples, validate on 178 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500


<tensorflow.python.keras.callbacks.History at 0x20597bbf148>

In [295]:
results = lstm_model.predict(embedding_matrix_test).flatten()
results = [int(r) for r in results]

In [296]:
from sklearn.metrics import cohen_kappa_score
# print(results)
print(X1.shape)
print(y1_train.shape)
print(y1_test[1:])
print(len(results))
percent = cohen_kappa_score(y1_test.values,results,weights='quadratic')
print(percent)

(1783, 8)
(1427,)
1606    8
1607    8
1608    9
1609    9
1610    9
       ..
1778    8
1779    7
1780    8
1781    2
1782    7
Name: domain1_score, Length: 177, dtype: int64
178
0.7360312412463543


In [317]:
print(X1.head)
def normalize(data,mean,std):
    return ((data-mean)/std)
X1_norm = X1.copy(deep=True)
X1_norm['num_chars'] = X1_norm['num_chars'].apply(normalize,args=[X1.mean(axis=0)['num_chars'],X1.std(axis=0)['num_chars']])
X1_norm['num_words'] = X1_norm['num_words'].apply(normalize,args=[X1.mean(axis=0)['num_words'],X1.std(axis=0)['num_words']])
X1_norm['num_sents'] = X1_norm['num_sents'].apply(normalize,args=[X1.mean(axis=0)['num_sents'],X1.std(axis=0)['num_sents']])
X1_norm['avg_word_length'] = X1_norm['avg_word_length'].apply(normalize,args=[X1.mean(axis=0)['avg_word_length'],X1.std(axis=0)['avg_word_length']])
X1_norm['noun_count'] = X1_norm['noun_count'].apply(normalize,args=[X1.mean(axis=0)['noun_count'],X1.std(axis=0)['noun_count']])
X1_norm['adj_count'] = X1_norm['adj_count'].apply(normalize,args=[X1.mean(axis=0)['adj_count'],X1.std(axis=0)['adj_count']])
X1_norm['verb_count'] = X1_norm['verb_count'].apply(normalize,args=[X1.mean(axis=0)['verb_count'],X1.std(axis=0)['verb_count']])
X1_norm['adv_count'] = X1_norm['adv_count'].apply(normalize,args=[X1.mean(axis=0)['adv_count'],X1.std(axis=0)['adv_count']])
print(X1_norm.head())
# print(X1.mean(axis=0))
# print(X1.mean(axis=0)['num_chars'])
# print(X1.std(axis=0))

<bound method NDFrame.head of       essay_id  essay_set                                              essay  \
0            1          1  Dear local newspaper, I think effects computer...   
1            2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2            3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3            4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4            5          1  Dear @LOCATION1, I know having computers has a...   
...        ...        ...                                                ...   
1778      1783          1  Dear @CAPS1, @CAPS2 several reasons on way I t...   
1779      1784          1  Do a adults and kids spend to much time on the...   
1780      1785          1  My opinion is that people should have computer...   
1781      1786          1  Dear readers, I think that its good and bad to...   
1782      1787          1  Dear - Local Newspaper I agree thats computers...   

      dom

In [318]:
print(X1.head)

<bound method NDFrame.head of       essay_id  essay_set                                              essay  \
0            1          1  Dear local newspaper, I think effects computer...   
1            2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2            3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3            4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4            5          1  Dear @LOCATION1, I know having computers has a...   
...        ...        ...                                                ...   
1778      1783          1  Dear @CAPS1, @CAPS2 several reasons on way I t...   
1779      1784          1  Do a adults and kids spend to much time on the...   
1780      1785          1  My opinion is that people should have computer...   
1781      1786          1  Dear readers, I think that its good and bad to...   
1782      1787          1  Dear - Local Newspaper I agree thats computers...   

      dom

In [352]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_features_model_nn():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(4, input_dim=8, activation='tanh'))
    model.add(tf.keras.layers.Dense(1, activation='relu'))
    print(model.summary())
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
    return model

In [326]:
features = X1_norm[['num_chars','num_words','num_sents','avg_word_length','noun_count','adj_count','verb_count','adv_count','domain1_score']]
# print(features.head)
features_array = np.asarray(features)
features_array_train = features_array[:1427]
features_array_val = features_array[1427:1427+178]
features_array_test = features_array[1427+178:]
print(features_array_test.shape)
print(features_array_val.shape)
print(features_array_train.shape)

(178, 9)
(178, 9)
(1427, 9)


In [359]:
features_model = get_features_model_nn()
features_model.fit(features_array_train[:,:-1], features_array_train[:,-1], epochs=800,validation_data=(features_array_val[:,:-1],features_array_val[:,-1]))

Model: "sequential_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_97 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_98 (Dense)             (None, 1)                 5         
Total params: 41
Trainable params: 41
Non-trainable params: 0
_________________________________________________________________
None
Train on 1427 samples, validate on 178 samples
Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37

<tensorflow.python.keras.callbacks.History at 0x205aa3c0988>

In [360]:
from sklearn.metrics import cohen_kappa_score
features_results = features_model.predict(features_array_test[:,:-1]).flatten()
feature_results = [int(r) for r in features_results]

# print(results)
# print(X1.shape)
# print(y1_train.shape)
# print(y1_test[1:])
# print(len(results))
percent = cohen_kappa_score(features_array_test[:,-1],feature_results,weights='quadratic')
print(percent)

0.7644932797650741


In [399]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_combined_model_nn():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(128, input_dim=308, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(64, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(24, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(1, activation='relu'))
    print(model.summary())
    model.compile(optimizer='rmsprop', loss='mean_squared_error', metrics=['accuracy'])
    return model

In [368]:
combined_matrix_train = np.concatenate((embedding_matrix_train, features_array_train[:,:-1]), axis=1)
combined_matrix_val = np.concatenate((embedding_matrix_val, features_array_val[:,:-1]), axis=1)
combined_matrix_test = np.concatenate((embedding_matrix_test, features_array_test[:,:-1]), axis=1)
print(combined_matrix_train.shape)
print(combined_matrix_val.shape)
print(combined_matrix_test.shape)

(1427, 308)
(178, 308)
(178, 308)


In [402]:
combined_model = get_combined_model_nn()
combined_model.fit(combined_matrix_train, y1_train, epochs=500,validation_data=(combined_matrix_val,y1_val))

Model: "sequential_53"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_160 (Dense)            (None, 128)               39552     
_________________________________________________________________
dense_161 (Dense)            (None, 64)                8256      
_________________________________________________________________
dense_162 (Dense)            (None, 24)                1560      
_________________________________________________________________
dense_163 (Dense)            (None, 1)                 25        
Total params: 49,393
Trainable params: 49,393
Non-trainable params: 0
_________________________________________________________________
None
Train on 1427 samples, validate on 178 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/

<tensorflow.python.keras.callbacks.History at 0x205b177be48>

In [403]:
from sklearn.metrics import cohen_kappa_score
combined_results = combined_model.predict(combined_matrix_test).flatten()
combined_results = [int(r) for r in combined_results]

# print(results)
# print(X1.shape)
# print(y1_train.shape)
# print(y1_test[1:])
# print(len(results))
percent = cohen_kappa_score(y1_test,combined_results,weights='quadratic')
print(percent)

0.7725496091401083


In [None]:
essay = "Tokenizer's divde strings int lists of substrings. For exaple, divide strings"
# syntactic_correctness(tokenize_sentences(essay)[0][0])
print(tokenize_words(essay)[1])
result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')