In [1]:
import numpy as np
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import KFold



In [2]:
# 1. Read in data
# Candidate preprocessed CSV files under dataset/; index 0 selects
# covid_preprocess_full.csv.
filenames=["covid_preprocess_full.csv","email_preprocess_full.csv","imdb_preprocess_full.csv",
          "news_preprocess_full.csv","review_preprocess_full.csv","twitter_preprocess.csv"]
data = pd.read_csv('dataset/'+filenames[0])
# Keep only the text column ('sw_exclude') and the label column ('sentiment'),
# restricted to the first 1000 rows.
# .copy() makes `data` an independent frame, so assigning to its columns later
# does not raise pandas' SettingWithCopyWarning (the original sliced a slice).
data = data[['sw_exclude','sentiment']].iloc[:1000].copy()
data.head()

Unnamed: 0,sw_exclude,sentiment
0,rajasthan wednesday report 74 new coronavirus ...,-1
1,total number coronavirus case delhi surge 3 43...,-1
2,condole demise actor rishi kapoor pm narendra ...,1
3,congress leader rahul gandhi condole rishi kap...,-1
4,sign respect healthcare professional fight cov...,1


In [3]:
# 2. Set up data
# Hyperparameters used by this notebook: max_fatures, embed_dim, lstm_out,
# batch_size and the dropout rates.
# The output layer uses softmax because the loss is categorical cross-entropy,
# so the labels are one-hot encoded here.
# X is the text (padded token-id sequences), Y is the sentiment (one-hot).
max_fatures = 2000  # vocabulary cap for the Tokenizer; spelling kept because get_model reads this name
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# Replace missing texts with empty strings. assign() rebinds `data` to a new
# frame instead of writing into a slice, which avoids SettingWithCopyWarning.
data = data.assign(sw_exclude=data['sw_exclude'].fillna(""))
tokenizer.fit_on_texts(data['sw_exclude'].values)
X = tokenizer.texts_to_sequences(data['sw_exclude'].values)
X = pad_sequences(X)
# Shift labels by +1 so that -1 maps to class 0 (assumes labels are in
# {-1, 0, 1} -- TODO confirm against the full dataset).
# num_classes is fixed so Y always has 3 columns, matching Dense(3) in
# get_model, even when the highest class is absent from this 1000-row subset.
Y = to_categorical(data['sentiment'].values+1, num_classes=3)

# Build a fresh model and set its hyperparameters
def get_model():
    """Build and compile a new Embedding -> SpatialDropout1D -> LSTM -> Dense classifier.

    Reads the module-level ``max_fatures`` (embedding vocabulary size) and
    ``X`` (its second dimension is used as the input sequence length).
    Prints the model summary as a side effect.

    Returns:
        A compiled ``Sequential`` model with a 3-unit softmax output, using
        categorical cross-entropy loss, the Adam optimizer and accuracy as
        the reported metric.
    """
    embed_dim = 128
    lstm_out = 100
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    # The output width has to match the number of classes being predicted (3 here);
    # softmax pairs with the categorical cross-entropy loss below.
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()
    return model


In [4]:
# Evaluate with 5-fold cross-validation
# Shuffle with a fixed seed so the folds do not depend on the row order of the
# CSV and the split itself is reproducible. (Model training is still
# stochastic, so accuracies can vary between runs.)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

batch_size = 128  # loop-invariant, so defined once outside the fold loop
accList=list()
for train_index, test_index in kf.split(X):
    X_train, Y_train = X[train_index], Y[train_index]
    X_test, Y_test = X[test_index], Y[test_index]

    # Train a freshly initialised model on each fold so folds stay independent.
    model=get_model()
    model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 1)

    # Convert predicted probabilities and one-hot targets to class indices.
    Y_pred=np.argmax(model.predict(X_test), axis=-1)
    df_test = pd.DataFrame({'true': np.argmax(Y_test, axis=-1), 'pred': Y_pred})
    print("test accuracy:")
    # Fraction of matching labels, stored as a plain float so the list prints cleanly.
    acc=float((df_test.pred==df_test.true).mean())
    print(acc)
    # For per-class diagnostics, confusion_matrix(df_test.true, df_test.pred) and
    # classification_report(df_test.true, df_test.pred) can be printed here.
    accList.append(acc)

print("accuracy list:{}".format(accList))
print("mean accuracy:{} ".format(np.mean(accList)))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 55, 128)           256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 55, 128)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 3)                 303       
                                                                 
Total params: 347,903
Trainable params: 347,903
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
