data: https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data  
code:  
https://www.kaggle.com/shyambhu/sentiment-classification-using-lstm

## Steps
step1 > [LOAD DATA](#load_data)  
step2 > [DATA_CLEANING](#cleaning)  
step3 > [model](#model)  
step4 > [Classification_Report](#class)

In [17]:
import pandas as pd
from nltk.corpus import stopwords, wordnet
import nltk
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re

from keras.models import Sequential
import keras
from keras.layers import Dense, LSTM, Activation, Embedding, Bidirectional, Dropout
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix

<a id = load_data> </a>
## LOAD DATA

In [2]:
# load data
pd.set_option('display.max_columns', None)
train = pd.read_csv('/home/bettyliao/sentiment/data/kaggle_movie_reviews/train.tsv', sep = '\t')
test = pd.read_csv('/home/bettyliao/sentiment/data/kaggle_movie_reviews/test.tsv', sep = '\t')
print(f"""
train :{train.info()}\n
test :{test.info()}
""")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    66292 non-null  int64 
 1   SentenceId  66292 non-null  int64 
 2   Phrase      66292 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.5+ MB

train :None

test :None



In [3]:
# 各類別百分比
train = train[['Phrase', 'Sentiment']]
percent = round(train.Sentiment.value_counts()/len(train)*100,2) 
print(percent)

2    50.99
3    21.10
1    17.48
4     5.90
0     4.53
Name: Sentiment, dtype: float64


In [4]:
train['length'] = train['Phrase'].apply(lambda x: len(x.split())) 
print(train['length'].max())

52


<a id = cleaning></a>
## DATA CLEANING

獲取單詞在句子中的詞性，再結合詞形還原，就能很好地完成詞形還原功能。

In [5]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)
# convert to lower and remove stopword
def clean_text(text):
    stopword = set(stopwords.words('english')) # load stopwords 
    lemmatizer = WordNetLemmatizer()
    text = text.lower() # convert to lower
    text = ' '.join([i for i in text.split() if i not in stopword]) 
    text = ' '.join([lemmatizer.lemmatize(i, get_wordnet_pos(i)) for i in text.split()])
    return text

In [6]:
train['clean_text'] = train['Phrase'].apply(lambda x: clean_text(x)) 

In [7]:
train.head()

Unnamed: 0,Phrase,Sentiment,length,clean_text
0,A series of escapades demonstrating the adage ...,1,37,series escapade demonstrate adage good goose a...
1,A series of escapades demonstrating the adage ...,2,14,series escapade demonstrate adage good goose
2,A series,2,2,series
3,A,2,1,
4,series,2,1,series


In [8]:
# tokenazation
max_features = 10000 # max words 10000
tokenizer = Tokenizer(num_words = max_features, split = ' ') 
tokenizer.fit_on_texts(train['clean_text'].values)
X = tokenizer.texts_to_sequences(train['clean_text'].values) 
X = pad_sequences(X, maxlen = 128)

<a id = model></a>
## model

In [9]:
y = train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42) 
print(f"""
X_train.shape: {X_train.shape},
X_test.shape: {X_test.shape},
y_train.shape: {y_train.shape},
y_test.shape: {y_test.shape}
""")


X_train.shape: (124848, 128),
X_test.shape: (31212, 128),
y_train.shape: (124848,),
y_test.shape: (31212,)



In [10]:
y_train.unique()

array([1, 2, 0, 3, 4])

embedding:
1. input_dim :This is the size of the vocabulary in the text data   
2. output_dim :This is the size of the vector space in which words will be embedded.  
3. input_length :This is the length of input sequences, as you would define for any input layer of a Keras model. 

In [11]:
embed_dim = 128
#vocal_dim = len(tokenizer) +1 

model = Sequential()
inputs = keras.Input(shape = (None, ), dtype = 'int32')
model.add(inputs)
model.add(Embedding(50000, embed_dim)) # pad_sequences(maxlen = 25) 
#input_dim = vocab_size, 
#output_dim = embed_dim
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(5, activation = "softmax")) # softmax 總合1
model.summary()
model.compile(loss = 'sparse_categorical_crossentropy',
             optimizer = 'adam', metrics = ['accuracy'])


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         6400000   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         98816     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
Total params: 6,598,277
Trainable params: 6,598,277
Non-trainable params: 0
_________________________________________________________________


plot_model(model, show_shapes = True)

In [28]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 32, verbose = 1, validation_data = (X_test, y_test))  

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<a id = class></a>
##  Classification Report

In [29]:
pred = model.predict(X_test)
pred = pred.argmax(axis = 1)

In [30]:
confusion = confusion_matrix(y_test, pred)

In [31]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.49      0.36      0.42      1416
           1       0.54      0.54      0.54      5527
           2       0.73      0.80      0.76     15639
           3       0.58      0.53      0.55      6707
           4       0.55      0.41      0.47      1923

    accuracy                           0.65     31212
   macro avg       0.58      0.53      0.55     31212
weighted avg       0.64      0.65      0.64     31212



```
def text_cleaning(text):
    stopwords = set(stopwords.words('english'))
    if text:
        text = ' '.join(text.split('.'))
        text = re.sub('\/', ' ',text)
        text = re.sub(r'\\', ' ', text)
        text = re.sub(r'((http)\S+)', '', text)
        text = re.sub(r'\s+', ' ', 
                      re.sub('[^A-Za-z]', ' ', text.strip().lower())).strip()  
        text = re.sub(r'\W+', ' ', text.strip().lower()).strip()
        text = [word for word in text.split() if word not in stopwords]
        return text
    return []
```