In [None]:
# mounting google drive for reading files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# loading the required packages
import numpy as np
import nltk
from nltk.corpus import stopwords
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading Data and creating Labels from data

In [None]:
path = '/content/drive/MyDrive/EMOJI Classification/full_set.txt'
with open(path) as f:
  content = f.readlines()
content = [x.strip() for x in content]
sentences = [x.split("\t")[0] for x in content]
labels = [x.split("\t")[1] for x in content]
y = np.array(labels,dtype='int8')
y = 2*y - 1

# Preprocessing text data

In [None]:
# Adding Stemming Technique
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
#porter=nltk.PorterStemmer()
porter=WordNetLemmatizer()
sentence_filtered=[]
for x in sentences:
  x=x.lower()
  x= re.sub(r'[^\w]',' ',x)
  x = re.sub(r'[0-9]+','',x)
  #sentence_filtered.append(' '.join([porter.lemmatize(word) for word in x.split() if word not in stop_words]))
  sentence_filtered.append(' '.join([porter.lemmatize(word) for word in x.split()]))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [None]:
# Applying TF IDF Vector
vectorizer = CountVectorizer(analyzer='word',preprocessor=None,max_features=6000,ngram_range=(1,3))
data_features = vectorizer.fit_transform(sentence_filtered)
tfidf_transformer = TfidfTransformer()
data_mat = tfidf_transformer.fit_transform(data_features).toarray()
#Splitting data into Train and Test
np.random.seed(0)
test_index = np.append(np.random.choice((np.where(y==-1))[0], 250, replace=False), np.random.choice((np.where(y==1))[0], 250, replace=False))
train_index = list(set(range(len(labels))) - set(test_index))
train_data = data_mat[train_index,]
train_labels = y[train_index]
test_data = data_mat[test_index,]
test_labels = y[test_index]

# Logistic Regression Model

In [None]:
#Applying Logistic Regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(train_data, train_labels)
preds_train = clf.predict(train_data)
preds_test = clf.predict(test_data)
## Compute errors
errs_train = np.sum((preds_train > 0.0) != (train_labels > 0.0))
errs_test = np.sum((preds_test > 0.0) != (test_labels > 0.0))
print("Training error: ", float(errs_train)/len(train_labels))
print("Test error: ", float(errs_test)/len(test_labels))

Training error:  0.0492
Test error:  0.166


In [None]:
from sklearn.metrics import classification_report
print(classification_report(test_labels,preds_test))

              precision    recall  f1-score   support

          -1       0.81      0.87      0.84       250
           1       0.86      0.80      0.83       250

    accuracy                           0.83       500
   macro avg       0.84      0.83      0.83       500
weighted avg       0.84      0.83      0.83       500



# SGD Model

In [None]:
# Applying SGD classifier
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="log", penalty="none")
clf.fit(train_data, train_labels)
preds_train = clf.predict(train_data)
preds_test = clf.predict(test_data)
## Compute errors
errs_train = np.sum((preds_train > 0.0) != (train_labels > 0.0))
errs_test = np.sum((preds_test > 0.0) != (test_labels > 0.0))
print("Training error: ", float(errs_train)/len(train_labels))
print("Test error: ", float(errs_test)/len(test_labels))


Training error:  0.002
Test error:  0.15


In [None]:
# saving the vectorizer 
import pickle
filename = '/content/drive/MyDrive/EMOJI Classification/vector_data.pkl'
pickle.dump(vectorizer, open(filename, 'wb'))
# loading the vectorizer
vec = pickle.load(open( filename, 'rb' ))


In [None]:
# saving the SGD classifier model 
import pickle
filename = '/content/drive/MyDrive/EMOJI Classification/finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [None]:
review = 'so bad'
inp = vec.transform([review])
# loading the model
loaded_model.predict(inp)


array([-1], dtype=int8)

In [None]:
# predicting the probability 
loaded_model.predict_proba(test_data[0:1,:])

array([[0.99654479, 0.00345521]])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(test_labels,preds_test))

              precision    recall  f1-score   support

          -1       0.85      0.84      0.85       250
           1       0.85      0.86      0.85       250

    accuracy                           0.85       500
   macro avg       0.85      0.85      0.85       500
weighted avg       0.85      0.85      0.85       500



# Naive Bayes Model

In [None]:
# Applying naive bayes method
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB().fit(train_data, train_labels)
nb_preds_test = nb_clf.predict(test_data)
nb_errs_test = np.sum((nb_preds_test > 0.0) != (test_labels > 0.0))
print("Test error: ", float(nb_errs_test)/len(test_labels))


Test error:  0.158


In [None]:
from sklearn.metrics import classification_report
print(classification_report(test_labels,preds_test))

              precision    recall  f1-score   support

          -1       0.85      0.84      0.85       250
           1       0.85      0.86      0.85       250

    accuracy                           0.85       500
   macro avg       0.85      0.85      0.85       500
weighted avg       0.85      0.85      0.85       500



In [None]:
print(nb_clf.predict(vectorizer.transform(["event is awesome"])))

[1]


# SVM Model

In [None]:
# building the model using SVM
from sklearn.svm import SVC
svm = SVC()
svm.fit(train_data, train_labels)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
svm_preds_test = svm.predict(test_data)
svm_errs_test = np.sum((svm_preds_test > 0.0) != (test_labels > 0.0))
print("Test error: ", float(svm_errs_test)/len(test_labels))

Test error:  0.154


In [None]:
svm.predict(vectorizer.transform(["it is not good"]).reshape(1,-1).toarray())

array([-1], dtype=int8)

# LSTM Neural Network Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
max_review_length = 200
tokenizer = Tokenizer(num_words=10000,  #max no. of unique words to keep
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 
                      lower=True #convert to lower case
                     )
#tokenizer.fit_on_texts(sentence_filtered)
X = tokenizer.texts_to_sequences(sentence_filtered)
X = sequence.pad_sequences(X, maxlen= max_review_length)
print('Shape of data tensor:', X.shape)
import pandas as pd
Y=pd.get_dummies(y).values
np.random.seed(0)
test_inds = np.append(np.random.choice((np.where(y==-1))[0], 250, replace=False), np.random.choice((np.where(y==1))[0], 250, replace=False))
train_inds = list(set(range(len(labels))) - set(test_inds))
train_data = X[train_inds,]
train_labels = Y[train_inds]
test_data = X[test_inds,]
test_labels = Y[test_inds]
EMBEDDING_DIM = 64
model = Sequential()
model.add(Embedding(10000, EMBEDDING_DIM, input_length=X.shape[1]))
#model.add(SpatialDropout1D(0.2))
# model.add(LSTM(250, dropout=0.2,return_sequences=True))
model.add(LSTM(100, dropout=0.2, return_sequences=True))
model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Shape of data tensor: (3000, 200)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 64)           640000    
_________________________________________________________________
lstm (LSTM)                  (None, 200, 100)          66000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dense (Dense)                (None, 2)                 102       
Total params: 736,302
Trainable params: 736,302
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
epochs = 10
batch_size = 50
model.fit(train_data, train_labels, 
          epochs=epochs, 
          batch_size=batch_size,
          validation_split=0.1)
loss, acc = model.evaluate(test_data, test_labels, verbose=2,
                            batch_size=batch_size)
print(f"loss: {loss}")
print(f"Validation accuracy: {acc}")
outcome_labels = ['Negative', 'Positive']
new = ["I would not recommend this movie"]
    
seq = tokenizer.texts_to_sequences(new)
padded = sequence.pad_sequences(seq, maxlen=max_review_length)
pred = model.predict(padded)
print("Probability distribution: ", pred)
print("Is this a Positive or Negative review? ")
print(outcome_labels[np.argmax(pred)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
10/10 - 1s - loss: 0.6936 - accuracy: 0.5000
loss: 0.6935827732086182
Validation accuracy: 0.5
Probability distribution:  [[0.5147547 0.4852453]]
Is this a Positive or Negative review? 
Negative
