In [10]:
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
hm_lines = 100000

In [11]:
def create_lexicon(pos,neg):

    lexicon = []
    with open(pos,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            all_words = word_tokenize(l)
            lexicon += list(all_words)

    with open(neg,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            all_words = word_tokenize(l)
            lexicon += list(all_words)
            
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        #print(w_counts[w])
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    print(len(l2))
    return l2

In [12]:
def sample_handling(sample,lexicon,classification):

    featureset = []

    with open(sample,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
    
            features = list(features)
            featureset.append([features,classification])

    return featureset

In [13]:
def create_feature_sets_and_labels(pos,neg,test_size = 0.1):
    lexicon = create_lexicon(pos,neg)
    features = []
    features += sample_handling('pos.txt',lexicon,[1,0])
    features += sample_handling('neg.txt',lexicon,[0,1])
    random.shuffle(features)
    features = np.array(features)

    testing_size = int(test_size*len(features))

    train_x = list(features[:,0][:-testing_size])
    train_y = list(features[:,1][:-testing_size])
    test_x = list(features[:,0][-testing_size:])
    test_y = list(features[:,1][-testing_size:])

    return train_x,train_y,test_x,test_y

In [14]:
if __name__ == '__main__':
    train_x,train_y,test_x,test_y = create_feature_sets_and_labels('pos.txt','neg.txt')
    # if you want to pickle this data:
    with open('sentiment_set.pickle','wb') as f:
        pickle.dump([train_x,train_y,test_x,test_y],f)

423


In [20]:
import tensorflow as tf

train_x,train_y,test_x,test_y = create_feature_sets_and_labels('pos.txt','neg.txt')

423


In [25]:
n_nodes_hl1 = 500 #no of nodes in hl1
n_nodes_hl2 = 600 #no of nodes in hl2
n_nodes_hl3 = 400 #no of nodes in hl3

n_classes = 2 #no of classes [positive/negative] 
batch_size = 100

In [26]:
x = tf.placeholder('float', [None, len(train_x[0])])
y = tf.placeholder('float')

In [27]:
def neural_network_model(data):
    
    hidden_layer_1 = {'Weights':tf.Variable(tf.random_normal([len(train_x[0]),n_nodes_hl1])),
                     'biases':tf.Variable(tf.random_normal([n_nodes_hl1]))}
    
    hidden_layer_2 = {'Weights':tf.Variable(tf.random_normal([n_nodes_hl1,n_nodes_hl2])),
                     'biases':tf.Variable(tf.random_normal([n_nodes_hl2]))}
    
    hidden_layer_3 = {'Weights':tf.Variable(tf.random_normal([n_nodes_hl2,n_nodes_hl3])),
                     'biases':tf.Variable(tf.random_normal([n_nodes_hl3]))}
    
    output_layer = {'Weights':tf.Variable(tf.random_normal([n_nodes_hl3,n_classes])),
                     'biases':tf.Variable(tf.random_normal([n_classes]))}
    
    l1 = tf.add(tf.matmul(data, hidden_layer_1['Weights']), hidden_layer_1['biases'])
    l1 = tf.nn.relu(l1)
    
    l2 = tf.add(tf.matmul(l1, hidden_layer_2['Weights']), hidden_layer_2['biases'])
    l2 = tf.nn.relu(l2)
    
    l3 = tf.add(tf.matmul(l2, hidden_layer_3['Weights']), hidden_layer_3['biases'])
    l3 = tf.nn.relu(l3)
    
    output = tf.add(tf.matmul(l3, output_layer['Weights']), output_layer['biases'])
 
    return output

In [31]:
def train_network(x): 
    
    prediction = neural_network_model(x)
    
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = prediction, labels = y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    #feed_forwards + backprops
    hm_epochs = 10
    
    with tf.Session() as sess: 
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(hm_epochs):
            epoch_loss= 0 
            
            i=0
            while i< len(train_x):
                start = i
                end = i + batch_size
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
        
                _ , c= sess.run([optimizer,cost], feed_dict = {x:batch_x, y:batch_y})
                epoch_loss += c
                i += batch_size
                
            print("Epoch",epoch+1, 'completed out of',hm_epochs,'loss: ', epoch_loss)
        correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
        accuracy = tf.reduce_mean(tf.cast(correct,'float'))
        print('Accuracy: ', accuracy.eval({x:test_x, y: test_y}))
           

In [32]:
train_network(x)

Epoch 1 completed out of 10 loss:  268219.544922
Epoch 2 completed out of 10 loss:  122638.123474
Epoch 3 completed out of 10 loss:  78026.0900879
Epoch 4 completed out of 10 loss:  52860.0198364
Epoch 5 completed out of 10 loss:  39062.7588043
Epoch 6 completed out of 10 loss:  39284.0993347
Epoch 7 completed out of 10 loss:  30861.2181702
Epoch 8 completed out of 10 loss:  22017.698822
Epoch 9 completed out of 10 loss:  13842.7756691
Epoch 10 completed out of 10 loss:  7985.73712254
Accuracy:  0.573171
