In [1]:
# import libraries

import re,string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.manifold import MDS


from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold

import pandas as pd
from pandas import DataFrame
import os

from gensim.models import Word2Vec,LdaMulticore, TfidfModel
from gensim import corpora

from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense


from gensim.models.doc2vec import Doc2Vec, TaggedDocument


import numpy as np

from collections import Counter
import random

import time

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# clean doc

def clean_doc(doc): 
    #split document into individual words
    tokens=doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 2]
    #lowercase all words
    tokens = [word.lower() for word in tokens]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]         
    return tokens

In [3]:
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [4]:
vocab = Counter()

os.chdir(r'C:\Users\yabon\Downloads')


#read in class corpus csv into python
data=pd.read_csv('combined_corpus_with_labels.csv')

#create empty list to store text documents titles
titles=[]

#for loop which appends the DSI title to the titles list
for i in range(0,len(data)):
    temp_text=data['DSI_Title'].iloc[i]
    titles.append(temp_text)

#create empty list to store text documents
text_body=[]

#for loop which appends the text to the text_body list
for i in range(0,len(data)):
    temp_text=data['Text'].iloc[i]
    text_body.append(temp_text)

#Note: the text_body is the unprocessed list of documents read directly form 
#the csv.
    
#empty list to store processed documents
processed_text=[]
#for loop to process the text to the processed_text list
for i in text_body:
    text=clean_doc(i)
    processed_text.append(text)
    vocab.update(text)

#Note: the processed_text is the PROCESSED list of documents read directly form 
#the csv.  Note the list of words is separated by commas.


#stitch back together individual words to reform body of text
final_processed_text=[]

for i in processed_text:
    temp_DSI=i[0]
    for k in range(1,len(i)):
        temp_DSI=temp_DSI+' '+i[k]
    final_processed_text.append(temp_DSI)
    
# only keep tokens with >=5 occurences
min_occurence = 5
tokens  = [k for k,c in vocab.items() if c>= min_occurence]
save_list(tokens, 'vocab.txt')

In [5]:
# Info on our data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  66 non-null     int64 
 1   DSI_Title   66 non-null     object
 2   Text        66 non-null     object
 3   category    66 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ KB


In [6]:
# prepare labels
labels = data['category']

In [7]:
#load vocabulary
file = open('vocab.txt','r')
text = file.read()
file.close()
vocab = set(text.split())

In [8]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [9]:
#split into train/test and obtain result for each mode
X_train, X_test, y_train, y_test = train_test_split(final_processed_text, labels.values, test_size=0.1)
mode = ['binary', 'count', 'tfidf', 'freq']
n_repeats = 10
results = DataFrame()
tokenizer = create_tokenizer(X_train)

#Create model to use in all evaluations
model = Sequential()
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

#iterate through each mode
for m in mode:
    time_start = time.clock()
    train = tokenizer.texts_to_matrix(X_train, mode = m)
    test = tokenizer.texts_to_matrix(X_test, mode = m)
    model.fit(train, y_train, epochs=10, verbose=2)
    scores = list()
    _, acc = model.evaluate(test, y_test, verbose=0)  
    scores.append(acc)
    scores.append(time.clock()-time_start)
    results[m] = scores
results

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  app.launch_new_instance()



Epoch 1/10
 - 0s - loss: 0.6854 - accuracy: 0.5254
Epoch 2/10
 - 0s - loss: 0.3341 - accuracy: 1.0000
Epoch 3/10
 - 0s - loss: 0.1755 - accuracy: 1.0000
Epoch 4/10
 - 0s - loss: 0.0920 - accuracy: 1.0000
Epoch 5/10
 - 0s - loss: 0.0469 - accuracy: 1.0000
Epoch 6/10
 - 0s - loss: 0.0243 - accuracy: 1.0000
Epoch 7/10
 - 0s - loss: 0.0127 - accuracy: 1.0000
Epoch 8/10
 - 0s - loss: 0.0072 - accuracy: 1.0000
Epoch 9/10
 - 0s - loss: 0.0042 - accuracy: 1.0000
Epoch 10/10
 - 0s - loss: 0.0026 - accuracy: 1.0000
Epoch 1/10
 - 0s - loss: 5.8730e-04 - accuracy: 1.0000
Epoch 2/10
 - 0s - loss: 3.6882e-04 - accuracy: 1.0000
Epoch 3/10
 - 0s - loss: 2.5998e-04 - accuracy: 1.0000




Epoch 4/10
 - 0s - loss: 1.9118e-04 - accuracy: 1.0000
Epoch 5/10
 - 0s - loss: 1.4428e-04 - accuracy: 1.0000
Epoch 6/10
 - 0s - loss: 1.1364e-04 - accuracy: 1.0000
Epoch 7/10
 - 0s - loss: 9.4207e-05 - accuracy: 1.0000
Epoch 8/10
 - 0s - loss: 7.9173e-05 - accuracy: 1.0000
Epoch 9/10
 - 0s - loss: 6.9806e-05 - accuracy: 1.0000
Epoch 10/10
 - 0s - loss: 6.1458e-05 - accuracy: 1.0000
Epoch 1/10
 - 0s - loss: 1.5138e-10 - accuracy: 1.0000
Epoch 2/10
 - 0s - loss: 1.2602e-10 - accuracy: 1.0000
Epoch 3/10
 - 0s - loss: 1.0614e-10 - accuracy: 1.0000
Epoch 4/10
 - 0s - loss: 9.4719e-11 - accuracy: 1.0000
Epoch 5/10
 - 0s - loss: 8.5217e-11 - accuracy: 1.0000
Epoch 6/10
 - 0s - loss: 7.8047e-11 - accuracy: 1.0000
Epoch 7/10
 - 0s - loss: 7.3007e-11 - accuracy: 1.0000
Epoch 8/10
 - 0s - loss: 6.8484e-11 - accuracy: 1.0000
Epoch 9/10
 - 0s - loss: 6.5678e-11 - accuracy: 1.0000
Epoch 10/10
 - 0s - loss: 6.2894e-11 - accuracy: 1.0000
Epoch 1/10
 - 0s - loss: 0.6718 - accuracy: 1.0000
Epoch 2/10
 

Unnamed: 0,binary,count,tfidf,freq
0,1.0,1.0,1.0,0.285714
1,2.195724,0.489018,0.582758,0.434876


In [10]:
def predict_sentiments(review, vocab, tokenizer, model):
    tokens = clean_doc(review)
    tokens = [w for w in tokens if w in vocab]
    line = ' '.join(tokens)
    encoded = tokenizer.texts_to_matrix([line], mode = 'count')
    yhat = model.predict(encoded, verbose=0)
    percent_pos = yhat[0,0]
    if round(percent_pos)==0:
        return 'ACTION/ADVENTURE'
    return 'OTHER'

In [11]:
#predict sentiments from 10 randomly selected files in original corpus
results = DataFrame()
for i in range(10):
    scores = list()
    num = random.randint(0,len(data))
    temp_text=data['Text'].iloc[num]
    cat = data['category'].iloc[num]
    if cat==0:
        scores.append('ACTION/ADVENTURE')
    else:
        scores.append('OTHERS')
    scores.append(predict_sentiments(temp_text, vocab, tokenizer, model))
    print(scores)

['ACTION/ADVENTURE', 'ACTION/ADVENTURE']
['OTHERS', 'OTHER']
['OTHERS', 'OTHER']
['OTHERS', 'OTHER']
['OTHERS', 'OTHER']
['OTHERS', 'OTHER']
['OTHERS', 'OTHER']
['ACTION/ADVENTURE', 'ACTION/ADVENTURE']
['OTHERS', 'OTHER']
['ACTION/ADVENTURE', 'ACTION/ADVENTURE']


In [12]:
#predict sentiments from 10 randomly selected files in untouched corpus
test_data=pd.read_csv('Random_samples.csv')
test_data.dropna()
for i in range(len(test_data)):
    scores = list()
    temp_text=test_data['Title'].iloc[i]
    cat = test_data['Category'].iloc[i]
    if cat==0:
        scores.append('ACTION/ADVENTURE')
    else:
        scores.append('OTHERS')
    scores.append(predict_sentiments(temp_text, vocab, tokenizer, model))
    print(scores)

['ACTION/ADVENTURE', 'ACTION/ADVENTURE']
['ACTION/ADVENTURE', 'ACTION/ADVENTURE']
['OTHERS', 'OTHER']
['OTHERS', 'OTHER']
