In [266]:
import keras
import gc
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras.utils import np_utils
from keras import optimizers
from keras import regularizers
from keras.utils.training_utils import multi_gpu_model

# <span style="color:red">Predefined helper methods</span>

In [267]:
def clean_str(string):
    """
    Normalise one raw text string into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens found
    in ``english_stopwords`` or ``english_punctuations`` are discarded, the
    remaining tokens are lemmatised with ``lemmatizer``, and only lemmas made
    up entirely of alphabetic characters are kept.

    Relies on the module-level names ``english_stopwords``,
    ``english_punctuations`` and ``lemmatizer`` being defined before the call.
    """
    kept = []
    for token in word_tokenize(string.lower()):
        # Drop stopwords and punctuation before paying for lemmatisation.
        if token in english_stopwords or token in english_punctuations:
            continue
        lemma = lemmatizer.lemmatize(token)
        if lemma.isalpha():
            kept.append(lemma)
    return ' '.join(kept)

In [268]:
def plot_2d(X, label):
    """
    Scatter-plot 2-D points coloured by class label.

    Points labelled '1' or 1 are drawn in red, points labelled '0' or 0 in
    green, and every other label in blue.

    Parameters
    ----------
    X : array-like
        Points to plot; columns 0 and 1 are used as the x and y coordinates.
    label : sequence
        One label per point. Labels are read positionally, so a pandas Series
        whose index is not 0..n-1 (e.g. after a train/test split) also works.

    Notes
    -----
    A class with no points is skipped instead of raising an IndexError when
    an empty array is sliced as 2-D.
    """
    points = np.asarray(X)
    # dtype=object keeps mixed str/int labels as they are and drops any index.
    labels = np.asarray(label, dtype=object)

    # (colour, members) pairs in the drawing order: class 1, class 0, the rest.
    groups = [('red', []), ('g', []), ('b', [])]
    for i in range(len(labels)):
        value = labels[i]
        if value == '1' or value == 1:
            groups[0][1].append(points[i])
        elif value == '0' or value == 0:
            groups[1][1].append(points[i])
        else:
            groups[2][1].append(points[i])

    plt.figure()
    for color, members in groups:
        if not members:
            # np.asarray([]) is 1-D, so coords[:, 0] would fail; nothing to draw anyway.
            continue
        coords = np.asarray(members)
        plt.scatter(coords[:, 0], coords[:, 1], color=color)
    plt.show()

In [269]:
def vis_tsne(X, label):
    """
    Embed X with a default-configured TSNE and plot the result by label.

    X is indexed as a 3-D array and flattened to (X.shape[0], X.shape[2])
    before fitting; this presumably expects the middle axis to have length 1
    (TODO confirm against callers). The embedding is handed to ``plot_2d``.

    Returns the input X unchanged (not the embedding).
    """
    n_rows, n_cols = X.shape[0], X.shape[2]
    embedding = TSNE().fit_transform(X.reshape(n_rows, n_cols))
    plot_2d(embedding, label)
    return X
    

# <span style="color:red"> Now we do some preprocessing</span>

In [270]:
# Load the tweets dataset; the 'text' and 'airline_sentiment' columns are used below.
df = pd.read_csv('../data/Tweets.csv')

In [271]:
# Hyper-parameters shared by the cells below.
LEARNING_RATE = 0.02  # step size passed to the SGD optimizer
MAX_FEATURES = 2000  # upper bound on the CountVectorizer vocabulary size
BATCH_SIZE = 8  # mini-batch size used when fitting the network
EPOCHS = 100  # maximum number of epochs (early stopping may end training sooner)
DECAY = 2e-4  # about half each epoch; NOTE(review): not referenced by any visible cell, confirm intent

In [272]:
# Features are the raw tweet text; labels are the sentiment classes encoded as
# the strings '0' (negative), '1' (positive) and '2' (neutral).
X = df['text']
# Build the encoded labels as a new Series instead of replacing in place on a
# column pulled out of df, so df itself is left untouched by this cell.
y = df['airline_sentiment'].replace({'neutral': '2', 'positive': '1', 'negative': '0'})

In [273]:
# Show how many tweets fall into each encoded sentiment class.
y.value_counts()

0    9178
2    3099
1    2363
Name: airline_sentiment, dtype: int64

In [274]:
# Preprocessing: tokenize, drop stopwords and punctuation, lemmatize, then vectorize.
lemmatizer = WordNetLemmatizer()
# Sets give constant-time membership tests; clean_str checks every token against both.
english_stopwords = set(stopwords.words('english'))
english_punctuations = {',', '.', '\'s', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'}
# Read from df['text'] and keep the cleaned text under its own name, so that
# re-running this cell does not call .apply on the already-vectorized X.
X_clean = df['text'].apply(clean_str)

# Vectorize with raw term counts (CountVectorizer, not tf-idf) over unigrams and bigrams.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=0.001, max_features=MAX_FEATURES)
X = vectorizer.fit_transform(X_clean)

#  <span style="color:red"> For our first model, bidirectional LSTM with fine-tuning </span> 

In [275]:
# Hold out 20% of the samples for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=517)

# Densify each sparse split and insert a length-1 axis in the middle, giving
# arrays of shape (samples, 1, features).
X_train = np.expand_dims(X_train.toarray(), axis=1)
X_test = np.expand_dims(X_test.toarray(), axis=1)

# One-hot encode the three class labels.
y_train = np_utils.to_categorical(y_train, num_classes=3)
y_test = np_utils.to_categorical(y_test, num_classes=3)

X_train.shape

(11712, 1, 1618)

In [276]:
# Layer stack: dense projection -> dropout -> bidirectional LSTM -> dense -> 3-way softmax.
bi_lstm = Sequential([
    Dense(512, activation='relu', input_shape=(1, X_train.shape[2])),
    Dropout(0.3),
    Bidirectional(LSTM(64, dropout=0.7, recurrent_dropout=0.3, return_sequences=False)),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax'),
])
# Wrap the model with multi_gpu_model and print the summary of the wrapped model.
bi_lstm = multi_gpu_model(bi_lstm)
bi_lstm.summary()

# Stop training once val_loss has gone 15 epochs without improving.
earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, verbose=1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
dense_76_input (InputLayer)     (None, 1, 1618)      0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 1, 1618)      0           dense_76_input[0][0]             
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 1, 1618)      0           dense_76_input[0][0]             
__________________________________________________________________________________________________
sequential_35 (Sequential)      (None, 3)            1132803     lambda_1[0][0]                   
                                                                 lambda_2[0][0]                   
__________

In [277]:
# Plain SGD at LEARNING_RATE; no decay argument is passed to the optimizer.
bi_lstm.compile(
    loss='categorical_crossentropy',
    optimizer=optimizers.SGD(lr=LEARNING_RATE),
    metrics=['accuracy'],
)
# Hold out 20% of the training data for validation, which drives early stopping.
bi_lstm.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    callbacks=[earlystopping],
    verbose=2,
)

Train on 9369 samples, validate on 2343 samples
Epoch 1/100
 - 31s - loss: 0.8973 - acc: 0.6306 - val_loss: 0.8731 - val_acc: 0.6116
Epoch 2/100
 - 19s - loss: 0.7968 - acc: 0.6454 - val_loss: 0.7547 - val_acc: 0.6765
Epoch 3/100
 - 18s - loss: 0.6971 - acc: 0.7099 - val_loss: 0.7042 - val_acc: 0.6948
Epoch 4/100
 - 19s - loss: 0.6345 - acc: 0.7416 - val_loss: 0.6350 - val_acc: 0.7311
Epoch 5/100
 - 18s - loss: 0.5824 - acc: 0.7597 - val_loss: 0.6160 - val_acc: 0.7473
Epoch 6/100
 - 19s - loss: 0.5443 - acc: 0.7818 - val_loss: 0.6137 - val_acc: 0.7520
Epoch 7/100
 - 19s - loss: 0.5069 - acc: 0.7986 - val_loss: 0.5856 - val_acc: 0.7670
Epoch 8/100
 - 19s - loss: 0.4739 - acc: 0.8099 - val_loss: 0.5777 - val_acc: 0.7665
Epoch 9/100
 - 20s - loss: 0.4480 - acc: 0.8236 - val_loss: 0.5885 - val_acc: 0.7648
Epoch 10/100
 - 20s - loss: 0.4192 - acc: 0.8325 - val_loss: 0.6018 - val_acc: 0.7725
Epoch 11/100
 - 20s - loss: 0.3919 - acc: 0.8433 - val_loss: 0.6159 - val_acc: 0.7704
Epoch 12/100
 -

<keras.callbacks.History at 0x7fcb83952f28>

In [278]:
# Evaluate on the held-out test split; the output lists the loss followed by the accuracy metric.
bi_lstm.evaluate(X_test, y_test)



[0.8927651204046656, 0.7715163934426229]

#  <span style="color:red">SVM on bag-of-words features</span>
###  The train/test split is re-created from the same X and y as above. The features are the CountVectorizer term counts (not tf-idf); GloVe embeddings are left for later.

In [142]:
# Re-create the split from X and y with the same random_state as before: the
# earlier cell overwrote X_train/X_test with dense 3-D arrays and y_train/y_test
# with one-hot labels. X holds CountVectorizer term counts (not tf-idf).
# GloVe embeddings are left for later.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 517)

In [143]:
# GridSearchCV is imported from sklearn.model_selection; the old
# sklearn.grid_search module was deprecated and later removed from scikit-learn.
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Grid-search an SVC over kernel, C and gamma. The fitted search object is bound
# to `clf` because the following cells read clf.best_estimator_ and call clf.score.
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10, 100], 'gamma': [0.1, 1, 10]}
svr = svm.SVC(class_weight='balanced', verbose=True)
clf = GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)



[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1, 10, 100], 'gamma': [0.1, 1, 10], 'kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [144]:
# Display the best estimator found by the grid search (requires clf to be a fitted search object).
clf.best_estimator_

SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [145]:
# Score the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

0.764344262295082

In [147]:
# Rebind clf to a fresh, unfitted SVC using the kernel, gamma and C shown in the
# best_estimator_ output above; this replaces the grid-search object held in clf.
clf = svm.SVC(kernel = 'rbf',gamma = 0.1, C = 10,class_weight = 'balanced', verbose = True)