# 介绍 

NLP情感分析学习. 参考资料[kaggle kernel: Movie Review Sentiment Analysis EDA and models](https://www.kaggle.com/artgor/movie-review-sentiment-analysis-eda-and-models)

# Modules 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
%matplotlib inline

# EDA

In [None]:
# Load the competition files: train and test phrases are tab-separated,
# the sample submission is comma-separated.
train = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/train.tsv', sep="\t")
test = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/test.tsv', sep="\t")
sub = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/sampleSubmission.csv', sep=",")

In [None]:
train.head()

In [None]:
help(np.percentile)

In [None]:
def df_info(df):
    """Print basic size statistics for a phrase-level review DataFrame.

    Args:
        df: DataFrame with ``PhraseId``, ``SentenceId`` and ``Phrase``
            columns, one row per phrase.

    Prints the row count, the number of distinct ``PhraseId`` values, the
    average number of rows per ``SentenceId`` and the 0/25/50/75/100th
    percentiles of the per-phrase word count.
    """
    separator = '=' * 20

    # Word count per phrase, splitting on single spaces. Mapping over the
    # 'Phrase' column avoids building a row object for every record.
    values = df['Phrase'].map(lambda phrase: len(phrase.split(' '))).values

    print(separator)
    print('# of row in df: {}'.format(len(df)))
    # Count distinct ids; summing group sizes would only repeat the row count.
    print('# of PhraseId: {0}'.format(df['PhraseId'].nunique()))
    print('avg # of PhraseId in every SentenceId: {0:.0f}'.format(
        df.groupby('SentenceId').size().mean()))
    print('stat # of word in each phrase is: {}'.format(
        np.percentile(values, [0, 25, 50, 75, 100])))
    print(separator)

In [None]:
df_info(train)
df_info(test)

In [None]:
df = train
values = df.apply(lambda row: len(row['Phrase'].split(' ')), axis=1).values

In [None]:
np.percentile(values, [0, 50])

# 传统机器学习过程 

## Feature extraction 

In [None]:
tokenizer = TweetTokenizer()

In [None]:
# TF-IDF over unigrams and bigrams, tokenised with the TweetTokenizer.
# ngram_range is passed as a (min_n, max_n) tuple, the documented form;
# a list may be rejected by scikit-learn's parameter validation.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
# Train and test phrases gathered into one list for fitting the vocabulary.
full_text = list(train['Phrase'].values) + list(test['Phrase'].values)

In [None]:
vectorizer.fit(full_text)

In [None]:
train_x = vectorizer.transform(train['Phrase'])
test_x = vectorizer.transform(test['Phrase'])

In [None]:
train_x.shape

In [None]:
test_x.shape

In [None]:
y = train['Sentiment']

## Train & evaluation

### Logistic regression 

In [None]:
log = LogisticRegression()
ovr = OneVsRestClassifier(log)

In [None]:
score = cross_val_score(ovr, X=train_x, y = y, cv=3).mean()
print(score)

In [None]:
# Second baseline: a linear SVM scored the same way as the logistic
# regression — mean of the 3-fold cross-validation scores.
svm = LinearSVC(
#     `dual` is not set, so the estimator's default applies.
)
score = cross_val_score(svm, X=train_x, y = y, cv=3).mean()
print(score)

# 深度学习 

In [None]:
from keras.layers import Input, Dense, Dropout, Flatten, SpatialDropout1D, BatchNormalization
from keras.layers import GlobalMaxPooling1D, GlobalAvgPool1D
from keras.layers import Embedding
from keras.layers import CuDNNGRU, CuDNNLSTM
from keras.layers import Bidirectional
from keras.layers import Conv1D
from keras.layers import concatenate
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.models import Model, load_model
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder

## Tokenization and embedding

### Tokenization

In [None]:
# Keras tokenizer fitted on the combined train + test phrases (full_text).
# Text is lower-cased; filters='' means no characters are stripped, so
# punctuation is kept in the tokens.
tk = Tokenizer(lower=True, filters='')
tk.fit_on_texts(full_text)

In [None]:
train.info()

In [None]:
train_tokenized = tk.texts_to_sequences(train['Phrase'])
test_tokenized = tk.texts_to_sequences(test['Phrase'])

###  pad

In [None]:
max_len = 50
train_X = pad_sequences(maxlen=max_len, sequences=train_tokenized)
test_X = pad_sequences(maxlen=max_len, sequences=test_tokenized)

In [None]:
# Load the pre-trained fastText word vectors and build the embedding matrix

embedding_path = "../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec"
embed_size = 300
max_features = 30000
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {token: float32 vector}. Each line is split
# on single spaces: first field is the token, the rest are coefficients.
# The context manager closes the file handle once the mapping is built.
# NOTE(review): if the file starts with a "<count> <dim>" header line it is
# stored as an ordinary (short) entry — confirm this is harmless.
with open(embedding_path, encoding='utf-8') as embedding_file:
    embedding_index = dict(
        get_coefs(*line.strip().split(" ")) for line in embedding_file
    )

# Build the (vocabulary + 1) x embed_size weight matrix for the Embedding
# layer. Rows stay all-zero for index 0 and for tokens without a
# pre-trained vector; tokens ranked at or beyond max_features are skipped.
word_index = tk.word_index
nb_words = min(len(word_index), max_features)
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
# One-hot encode the sentiment labels into a dense array with one column
# per class; this is the target format fed to the Keras model.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` — confirm against the installed version.
ohe = OneHotEncoder(sparse=False)
y_ohe = ohe.fit_transform(y.values.reshape(-1, 1))

In [None]:
np.shape(y_ohe)

## Model definition and training

In [None]:
def build_model1(units, spatial_dr=1, cov_size=32, kernel_size1=3, kernel_size2=2, dense_unit=128, dr=0.1, lr=0.001, lr_d=0.0):
    """Build, train and return a BiGRU -> Conv1D -> BiLSTM -> Conv1D classifier.

    Reads the notebook globals ``max_len``, ``embed_size``,
    ``embedding_matrix``, ``train_X`` and ``y_ohe``, trains for up to 20
    epochs with a 10% validation split, and checkpoints to ``model.hdf5``.

    Args:
        units: hidden size of each direction of the GRU and LSTM layers.
        spatial_dr: rate passed to ``SpatialDropout1D``.
            NOTE(review): the default of 1 looks like a placeholder; the
            call in this notebook passes 0.3.
        cov_size: number of filters of every ``Conv1D`` layer.
        kernel_size1: kernel size of the first convolution in each pair.
        kernel_size2: kernel size of the second convolution in each pair.
        dense_unit: width of the first dense layer; the second uses half.
        dr: dropout rate applied after each dense layer.
        lr: Adam learning rate.
        lr_d: Adam learning-rate decay.

    Returns:
        The model reloaded from the checkpoint with the lowest ``val_loss``.
    """
    file_path = 'model.hdf5'
    # Keep only the weights of the epoch with the lowest validation loss.
    check_point = ModelCheckpoint(
        filepath=file_path,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        mode='min',
    )
    # Stop once validation loss has not improved for 3 consecutive epochs.
    earlystopping = EarlyStopping(
        monitor='val_loss',
        patience=3,
        mode='min',
    )

    def conv_pools(sequence, kernel_size):
        """Apply a 'valid' Conv1D; return (conv, global max pool, global avg pool)."""
        conv = Conv1D(
            filters=cov_size,
            kernel_size=kernel_size,
            padding='valid',
            kernel_initializer='he_uniform',
        )(sequence)
        return conv, GlobalMaxPooling1D()(conv), GlobalAvgPool1D()(conv)

    # The input is a padded sequence of max_len token ids.
    inp = Input(shape=(max_len,))
    # Frozen pre-trained embeddings. The vocabulary size is taken from the
    # weight matrix itself so it cannot drift from a hard-coded constant.
    x = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embed_size,
        weights=[embedding_matrix],
        trainable=False,
    )(inp)
    x = SpatialDropout1D(spatial_dr)(x)

    # return_sequences=True makes the recurrent layer emit its output at
    # every timestep, which the following Conv1D layers convolve over.
    x_gru = Bidirectional(CuDNNGRU(units, return_sequences=True))(x)
    x1, maxpool_x1, avgpool_x1 = conv_pools(x_gru, kernel_size1)
    _, maxpool_x2, avgpool_x2 = conv_pools(x_gru, kernel_size2)

    # The LSTM branch is stacked on the first convolution of the GRU branch.
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences=True))(x1)
    _, maxpool_x3, avgpool_x3 = conv_pools(x_lstm, kernel_size1)
    _, maxpool_x4, avgpool_x4 = conv_pools(x_lstm, kernel_size2)

    x = concatenate([
        maxpool_x1, avgpool_x1,
        maxpool_x2, avgpool_x2,
        maxpool_x3, avgpool_x3,
        maxpool_x4, avgpool_x4,
    ])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_unit, activation='relu')(x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_unit / 2), activation='relu')(x))
    # NOTE(review): sigmoid + binary_crossentropy scores the 5 classes
    # independently; softmax + categorical_crossentropy may be the better
    # fit for mutually exclusive labels — kept as in the reference kernel.
    output = Dense(5, activation='sigmoid')(x)

    model = Model(inputs=inp, outputs=output)
    model.compile(
        optimizer=Adam(lr=lr, decay=lr_d),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    model.fit(
        x=train_X,
        y=y_ohe,
        batch_size=128,
        epochs=20,
        verbose=1,
        callbacks=[check_point, earlystopping],
        validation_split=0.1,
    )
    # load_model is a module-level function, not a Model method; rebind the
    # result so the best checkpoint (not the last epoch) is returned.
    model = load_model(file_path)
    return model


In [None]:
model1 = build_model1(lr = 1e-3, lr_d = 1e-10, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=2, dense_unit=32, dr=0.1, cov_size=32)

## General information

In this kernel I'll work with data from Movie Review Sentiment Analysis Playground Competition.

This dataset is interesting for NLP researching. Sentences from original dataset were split in separate phrases and each of them has a sentiment label. Also a lot of phrases are really short which makes classifying them quite challenging. Let's try!

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import TweetTokenizer
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
pd.set_option('max_colwidth',400)

# 探索 

In [None]:
train = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/train.tsv', sep="\t")
test = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/test.tsv', sep="\t")
sub = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/sampleSubmission.csv', sep=",")

In [None]:
train.head(10)

In [None]:
train.loc[train.SentenceId == 2]

In [None]:
print('Average count of phrases per sentence in train is {0:.0f}.'.format(train.groupby('SentenceId')['Phrase'].count().mean()))
print('Average count of phrases per sentence in test is {0:.0f}.'.format(test.groupby('SentenceId')['Phrase'].count().mean()))

In [None]:
print('Number of phrases in train: {}. Number of sentences in train: {}.'.format(train.shape[0], len(train.SentenceId.unique())))
print('Number of phrases in test: {}. Number of sentences in test: {}.'.format(test.shape[0], len(test.SentenceId.unique())))

In [None]:
print('Average word length of phrases in train is {0:.0f}.'.format(np.mean(train['Phrase'].apply(lambda x: len(x.split())))))
print('Average word length of phrases in test is {0:.0f}.'.format(np.mean(test['Phrase'].apply(lambda x: len(x.split())))))

In [None]:
train['Sentiment'].value_counts()

We can see that sentences were split into 18-20 phrases on average, and a lot of phrases contain each other. **Sometimes one word or even one punctuation mark influences the sentiment.**

Let's see for example most common trigrams for positive phrases

ngrams:  几个为一组

In [None]:
# Join every phrase whose Sentiment is 4 into one string, then slide a
# three-token window over its whitespace-separated words.
text = ' '.join(train.loc[train['Sentiment'] == 4, 'Phrase'].values)
text_trigrams = list(ngrams(text.split(), 3))

In [None]:
text_trigrams[:3]

In [None]:
Counter(text_trigrams).most_common(30)

In [None]:
# Same trigram count as before, but with English stopwords removed first.
text = ' '.join(train.loc[train.Sentiment == 4, 'Phrase'].values)
# Load the stopword list once into a set: the previous version re-read the
# list for every token and did a linear scan through it each time.
english_stopwords = set(stopwords.words('english'))
text = [i for i in text.split() if i not in english_stopwords]
text_trigrams = list(ngrams(text, 3))
Counter(text_trigrams).most_common(30)

The results show the main problem with this dataset: **there are too many common words because sentences were split into phrases**. As a result, stopwords shouldn't be removed from the text.

### Thoughts on feature processing and engineering

So, we have only phrases as data. And a phrase can contain a single word. And one punctuation mark can cause phrase to receive a different sentiment. Also assigned sentiments can be strange. This means several things:
- using stopwords can be a bad idea, especially when phrases contain one single stopword;
- punctuation could be important, so it should be used;
- ngrams are necessary to get the most info from data;
- using features like word count or sentence length won't be useful;

# 编码  tokenizer

In [None]:
tokenizer = TweetTokenizer()

关于 `ngram_range`:下面使用 `(1, 2)`,即同时提取 unigram 和 bigram 作为特征。

编码

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(train['Phrase'].values) + list(test['Phrase'].values)
vectorizer.fit(full_text)
train_vectorized = vectorizer.transform(train['Phrase'])
test_vectorized = vectorizer.transform(test['Phrase'])

In [None]:
y = train['Sentiment']

In [None]:
train_vectorized[:2]

In [None]:
help(OneVsRestClassifier)

# 训练 

In [None]:
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

In [None]:
%%time
ovr.fit(train_vectorized, y)

In [None]:
help(LinearSVC)

In [None]:
scores = cross_val_score(ovr, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

In [None]:
%%time
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

# 评估 

In [None]:
ovr.fit(train_vectorized, y);
svc.fit(train_vectorized, y);

## Deep learning
And now let's try DL. DL should work better for text classification with multiple layers. I use an architecture similar to those which were used in toxic competition.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

1.keras.layers:
    1.  Bidirectional rnn: 双向rnn
    2. GRU:  Gated Recurrent Unit, LSTM 变体, 使用更新门(update gate)和重置门(reset gate)
    3.  GlobalMaxPool1D, MaxPooling1D [差别](https://stackoverflow.com/questions/43728235/what-is-the-difference-between-keras-maxpooling1d-and-globalmaxpooling1d-functi)
    4. constraints:  functions that impose constraints on weight values (e.g. MaxNorm, MinMaxNorm)

# 编码 tokenizer

In [None]:
import keras
dir(keras)

In [None]:
tk = Tokenizer(lower = True, filters='')
tk.fit_on_texts(full_text)

In [None]:
train_tokenized = tk.texts_to_sequences(train['Phrase'])
test_tokenized = tk.texts_to_sequences(test['Phrase'])

In [None]:
print(len(train_tokenized))
print(len(train_tokenized[0]))
print(len(train_tokenized[20]))


# print(test_tokenized.shape)

# Padding

In [None]:
max_len = 50
X_train = pad_sequences(train_tokenized, maxlen = max_len)
X_test = pad_sequences(test_tokenized, maxlen = max_len)

In [None]:
X_train.shape

In [None]:
X_test.shape

# Embedding 

In [None]:
embedding_path = "../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec"

In [None]:
embed_size = 300
max_features = 30000

In [None]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the coefficients as a float32 array."""
    return word, np.asarray(arr, dtype='float32')


# Parse the embedding file into {token: float32 vector}. Each line is split
# on single spaces: first field is the token, the rest are coefficients.
# The context manager closes the file handle once the mapping is built.
# NOTE(review): if the file starts with a "<count> <dim>" header line it is
# stored as an ordinary (short) entry — confirm this is harmless.
with open(embedding_path, encoding='utf-8') as embedding_file:
    embedding_index = dict(
        get_coefs(*line.strip().split(" ")) for line in embedding_file
    )

In [None]:
# Build the (vocabulary + 1) x embed_size weight matrix for the Embedding
# layer. Rows stay all-zero for index 0 and for tokens without a
# pre-trained vector; tokens ranked at or beyond max_features are skipped.
word_index = tk.word_index
nb_words = min(len(word_index), max_features)
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
y_ohe = ohe.fit_transform(y.values.reshape(-1, 1))

In [None]:
embedding_matrix.shape

# build model and train 

In [None]:
help(CuDNNGRU)

In [None]:
import tensorflow as tf
def build_model1(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    """Build, train and return a BiGRU -> Conv1D -> BiLSTM -> Conv1D classifier.

    Reads the notebook globals ``max_len``, ``embed_size``,
    ``embedding_matrix``, ``X_train`` and ``y_ohe``, trains for up to 20
    epochs with a 10% validation split, checkpoints to ``best_model.hdf5``
    and writes the Keras session graph to ``logs/`` for TensorBoard.

    Args:
        lr: Adam learning rate.
        lr_d: Adam learning-rate decay.
        units: hidden size of each direction of the GRU and LSTM layers.
        spatial_dr: rate passed to ``SpatialDropout1D``.
        kernel_size1: kernel size of the first convolution in each pair.
        kernel_size2: kernel size of the second convolution in each pair.
        dense_units: width of the first dense layer; the second uses half.
        dr: dropout rate applied after each dense layer.
        conv_size: number of filters of every ``Conv1D`` layer.

    NOTE(review): the defaults ``lr=0.0`` and ``units=0`` look like
    placeholders; every call in this notebook passes explicit values.

    Returns:
        The model reloaded from the checkpoint with the lowest ``val_loss``.
    """
    file_path = "best_model.hdf5"
    # With save_best_only=True the file is rewritten only on epochs where
    # val_loss improves, so it always holds the best weights seen so far.
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)

    def conv_pools(sequence, kernel_size):
        """Apply a 'valid' Conv1D; return (conv, global avg pool, global max pool)."""
        conv = Conv1D(conv_size, kernel_size=kernel_size, padding='valid',
                      kernel_initializer='he_uniform')(sequence)
        return conv, GlobalAveragePooling1D()(conv), GlobalMaxPooling1D()(conv)

    inp = Input(shape=(max_len,))
    # Pre-trained vectors are loaded as frozen weights (not trained here).
    # The vocabulary size comes from the matrix itself instead of a
    # hard-coded constant, so the two can never disagree.
    embedded = Embedding(embedding_matrix.shape[0], embed_size,
                         weights=[embedding_matrix], trainable=False)(inp)
    # Spatial dropout drops whole embedding channels rather than single
    # elements; the output keeps the same shape for the next layer.
    embedded = SpatialDropout1D(spatial_dr)(embedded)

    # CuDNN-backed recurrent layers run on GPU only.
    x_gru = Bidirectional(CuDNNGRU(units, return_sequences=True))(embedded)
    gru_conv1, avg_pool1_gru, max_pool1_gru = conv_pools(x_gru, kernel_size1)
    _, avg_pool3_gru, max_pool3_gru = conv_pools(x_gru, kernel_size2)

    # The LSTM branch is stacked on the first convolution of the GRU branch.
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences=True))(gru_conv1)
    _, avg_pool1_lstm, max_pool1_lstm = conv_pools(x_lstm, kernel_size1)
    _, avg_pool3_lstm, max_pool3_lstm = conv_pools(x_lstm, kernel_size2)

    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool3_gru, max_pool3_gru,
                     avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu')(x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu')(x))
    x = Dense(5, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    model.fit(X_train, y_ohe, batch_size=128, epochs=20, validation_split=0.1,
              verbose=1, callbacks=[check_point, early_stop])
    model = load_model(file_path)
    # `sess` was never defined here; take the graph from the session the
    # Keras backend is using, and close the writer so the event file is
    # flushed to disk.
    writer = tf.summary.FileWriter('logs/', K.get_session().graph)
    writer.close()
    return model

An attempt at ensemble:

In [None]:
model1 = build_model1(lr = 1e-3, lr_d = 1e-10, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=2, dense_units=32, dr=0.1, conv_size=32)

In [None]:
model2 = build_model1(lr = 1e-3, lr_d = 1e-10, units = 128, spatial_dr = 0.5, kernel_size1=3, kernel_size2=2, dense_units=64, dr=0.2, conv_size=32)

In [None]:
def build_model2(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    """Build, train and return a parallel BiGRU / BiLSTM + Conv1D classifier.

    Unlike ``build_model1``, the GRU and the LSTM both read the embedded
    input directly; each recurrent output feeds two convolutions whose
    global average and max pools are concatenated.

    Reads the notebook globals ``max_len``, ``embed_size``,
    ``embedding_matrix``, ``X_train`` and ``y_ohe``, trains for up to 20
    epochs with a 10% validation split and checkpoints to
    ``best_model.hdf5``.

    Args:
        lr: Adam learning rate.
        lr_d: Adam learning-rate decay.
        units: hidden size of each direction of the GRU and LSTM layers.
        spatial_dr: rate passed to ``SpatialDropout1D``.
        kernel_size1: kernel size of the first convolution on each branch.
        kernel_size2: kernel size of the second convolution on each branch.
        dense_units: width of the first dense layer; the second uses half.
        dr: dropout rate applied after each dense layer.
        conv_size: number of filters of every ``Conv1D`` layer.

    Returns:
        The model reloaded from the checkpoint with the lowest ``val_loss``.
    """
    file_path = "best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)

    def conv_pools(sequence, kernel_size):
        """Apply a 'valid' Conv1D; return (global avg pool, global max pool)."""
        conv = Conv1D(conv_size, kernel_size=kernel_size, padding='valid',
                      kernel_initializer='he_uniform')(sequence)
        return GlobalAveragePooling1D()(conv), GlobalMaxPooling1D()(conv)

    inp = Input(shape=(max_len,))
    # Vocabulary size is read from the frozen weight matrix rather than a
    # hard-coded constant, so the layer always matches its weights.
    embedded = Embedding(embedding_matrix.shape[0], embed_size,
                         weights=[embedding_matrix], trainable=False)(inp)
    embedded = SpatialDropout1D(spatial_dr)(embedded)

    x_gru = Bidirectional(CuDNNGRU(units, return_sequences=True))(embedded)
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences=True))(embedded)

    avg_pool1_gru, max_pool1_gru = conv_pools(x_gru, kernel_size1)
    avg_pool2_gru, max_pool2_gru = conv_pools(x_gru, kernel_size2)
    avg_pool1_lstm, max_pool1_lstm = conv_pools(x_lstm, kernel_size1)
    avg_pool2_lstm, max_pool2_lstm = conv_pools(x_lstm, kernel_size2)

    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool2_gru, max_pool2_gru,
                     avg_pool1_lstm, max_pool1_lstm, avg_pool2_lstm, max_pool2_lstm])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu')(x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu')(x))
    x = Dense(5, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    model.fit(X_train, y_ohe, batch_size=128, epochs=20, validation_split=0.1,
              verbose=1, callbacks=[check_point, early_stop])
    # Return the best checkpoint rather than the weights of the last epoch.
    model = load_model(file_path)
    return model

In [None]:
model3 = build_model2(lr = 1e-4, lr_d = 0, units = 64, spatial_dr = 0.5, kernel_size1=4, kernel_size2=3, dense_units=32, dr=0.1, conv_size=32)

In [None]:
model4 = build_model2(lr = 1e-3, lr_d = 0, units = 64, spatial_dr = 0.5, kernel_size1=3, kernel_size2=3, dense_units=64, dr=0.3, conv_size=32)

In [None]:
model5 = build_model2(lr = 1e-3, lr_d = 1e-7, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=3, dense_units=64, dr=0.4, conv_size=64)

In [None]:
# Score the test set with each of the five models.
pred1 = model1.predict(X_test, batch_size = 1024, verbose = 1)
pred2 = model2.predict(X_test, batch_size = 1024, verbose = 1)
pred3 = model3.predict(X_test, batch_size = 1024, verbose = 1)
pred4 = model4.predict(X_test, batch_size = 1024, verbose = 1)
pred5 = model5.predict(X_test, batch_size = 1024, verbose = 1)
# Blend by summing the per-class scores into a new array. The previous
# `pred = pred1` followed by `pred += ...` aliased pred1 and overwrote
# model1's own predictions with the running total.
pred = pred1 + pred2 + pred3 + pred4 + pred5

In [None]:
# The class with the highest blended score is the predicted sentiment.
# argmax already returns integer indices, so only the int cast is needed.
predictions = pred.argmax(axis=1).astype(int)
sub['Sentiment'] = predictions
sub.to_csv("blend.csv", index=False)