# Importing Libraries

In [None]:
!pip install wordcloud

In [None]:
!pip install spacy

In [None]:
# utilities
import re
import pickle
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
#from wordcloud import WordCloud
import matplotlib.pyplot as plt

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import pos_tag, word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

#nlp
import spacy
#import en_core_web_sm
#nlp = spacy.load('en_core_web_sm')

In [None]:
!pip install -q kaggle

with open('kaggle.json', 'w') as f:
  f.write('{"username":"yashgroot","key":"56590d8c3eb21dc01ac54fc1c3f39663"}')

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d kazanova/sentiment140
!unzip sentiment140.zip -d ./
# Importing the dataset
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset=pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = DATASET_ENCODING,  names = DATASET_COLUMNS)
dataset


# Dataset

In [None]:
# Keep only the label and the tweet text. `.copy()` makes this an independent
# frame so the column assignment below does not write into a slice of the
# original DataFrame (which raises SettingWithCopyWarning).
dataset = dataset[['sentiment','text']].copy()

# Class balance of the labels as loaded.
ax = dataset.groupby('sentiment').count().plot(kind='bar', title='Distribution on basis of sentiments', legend=True)
ax.set_xticklabels(['Negative','Positive'], rotation=0)

# Recode the positive label 4 as 1 so the target is binary 0/1.
dataset['sentiment'] = dataset['sentiment'].replace(4,1)

In [None]:
# Split by label rather than by row position (head/tail only works while the
# file happens to be sorted by sentiment). `.copy()` lets later cells add
# columns to these frames without SettingWithCopyWarning.
dataset_negative = dataset[dataset['sentiment'] == 0].copy()
dataset_positive = dataset[dataset['sentiment'] == 1].copy()

# Fixed seeds make the 20000-per-class sample reproducible across kernel restarts.
dataset_positive_sampled = dataset_positive.sample(n=20000, random_state=42)
dataset_negative_sampled = dataset_negative.sample(n=20000, random_state=42)

# Negatives first, positives second: later cells slice on this ordering.
dataset_sampled = pd.concat([dataset_negative_sampled, dataset_positive_sampled])
tweets, sentiment = list(dataset_sampled['text']), list(dataset_sampled['sentiment'])

# Distribution of tweet word counts per sentiment

In [None]:
# Count whitespace-separated words in every positive tweet and plot the histogram.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the installed version still provides it.
dataset_positive["word_count"] = dataset_positive['text'].apply(lambda x: len(str(x).split()))
sns.distplot(dataset_positive.word_count, kde=False, rug=True)

In [None]:
# Count whitespace-separated words in every negative tweet and plot the histogram.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the installed version still provides it.
dataset_negative["word_count"] = dataset_negative['text'].apply(lambda x: len(str(x).split()))
sns.distplot(dataset_negative.word_count, kde=False, rug=True)

# Preprocessing

In [None]:
# Slang, abbreviations and markup residue mapped to their normalised form.
# An empty string means "drop this token".
_TOKEN_REPLACEMENTS = {
    'u': 'you',
    'r': 'are',
    'some1': 'someone',
    'yrs': 'years',
    'hrs': 'hours',
    'mins': 'minutes',
    'secs': 'seconds',
    'pls': 'please',
    'plz': 'please',
    '2morow': 'tomorrow',
    '2day': 'today',
    '4got': 'forget',
    '4gotten': 'forget',
    'hahah': 'haha',
    'hahaha': 'haha',
    'hahahaha': 'haha',
    'lmao': 'lol',
    'lolz': 'lol',
    'rofl': 'lol',
    'goood': 'good',
    'thanx': 'thanks',
    'thnx': 'thanks',
    'bday': 'birthday',
    'b-day': 'birthday',
    'amp': '',
    'quot': '',
    'lt': '',
    'gt': '',
    '½25': '',
    'URL': '',
}

# Contractions that are kept but written without the apostrophe.
_CONTRACTIONS = frozenset([
    "i'm", "don't", "can't", "couldn't", "aren't", "wouldn't", "isn't",
    "didn't", "hadn't", "doesn't", "won't", "haven't", "wasn't", "hasn't",
    "shouldn't", "ain't", "they've",
])


def cleaned(token):
    """Normalise a single tweet token.

    Known slang/abbreviations are expanded, a fixed list of contractions has
    its apostrophe removed, and markup residue maps to the empty string.
    Matching is exact and case-sensitive; any other token is returned as is.
    """
    if token in _TOKEN_REPLACEMENTS:
        return _TOKEN_REPLACEMENTS[token]
    if token in _CONTRACTIONS:
        return token.replace("'", "")
    return token

In [None]:
import time
# Wall-clock timer for the preprocessing step.
t = time.time()
# NOTE(review): `pre_process` is not defined anywhere in the visible notebook —
# confirm it is defined before this cell runs. Later cells use its two results
# as a sequence of processed texts and a sequence of (tokens, label) pairs.
pre_processed_text, cleaned_token_list = pre_process(tweets,sentiment)
print(f'Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

# Word cloud visualization

In [None]:
# WordCloud is only imported in a commented-out line of the imports cell, so it
# has to be brought into scope here for this cell to run on a fresh kernel.
from wordcloud import WordCloud

# The sample was concatenated negatives-first, 20000 per class.
# NOTE(review): assumes pre_process keeps the input order — confirm.
data_neg = pre_processed_text[:20000]
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800,
               collocations=False).generate(" ".join(data_neg))
plt.imshow(wc)

In [None]:
# WordCloud is only imported in a commented-out line of the imports cell, so it
# has to be brought into scope here for this cell to run on a fresh kernel.
from wordcloud import WordCloud

# Everything after the first 20000 entries is the positive half of the sample.
# NOTE(review): assumes pre_process keeps the input order — confirm.
data_pos = pre_processed_text[20000:]
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800,
              collocations=False).generate(" ".join(data_pos))
plt.figure(figsize = (20,20))
plt.imshow(wc)

In [None]:
# `nlp` is only assigned in a commented-out line of the imports cell, so on a
# fresh kernel this cell would stop with NameError. Load the pipeline on demand.
# NOTE(review): requires the `en_core_web_sm` spaCy model to be installed — confirm.
if 'nlp' not in globals():
    nlp = spacy.load('en_core_web_sm')

dataset_sampled['pre_processed_text'] = pre_processed_text
# Run every preprocessed tweet through the spaCy pipeline; the result is stored in 'cleaned'.
dataset_sampled['cleaned'] = dataset_sampled.pre_processed_text.apply(nlp)
# Human-readable category names for the scattertext cells that follow.
dataset_sampled = dataset_sampled.replace({'sentiment': 0}, 'negative')
dataset_sampled = dataset_sampled.replace({'sentiment': 1}, 'positive')
dataset_sampled
# dataset_sampled.info(memory_usage="deep")

In [None]:
# Build a corpus from the parsed 'cleaned' column, using 'sentiment' as the category.
# NOTE(review): `st` is never imported in the visible notebook (presumably `import scattertext as st`) — confirm.
corpus = st.CorpusFromParsedDocuments(dataset_sampled, category_col='sentiment', parsed_col='cleaned').build()

In [None]:
# Render the explorer as an HTML string, with 'positive' as the focus category
# and the raw tweet text passed as per-document metadata.
html = st.produce_scattertext_explorer(corpus,
                                       category='positive',
                                       category_name='positive',
                                       not_category_name='negative',
                                       minimum_term_frequency=5,
                                       width_in_pixels=1000,
                                       transform=st.Scalers.percentile,
                                       metadata=dataset_sampled['text'])
# Write the report, UTF-8 encoded, to the current working directory.
file_name = 'ScattertextLog.html'
with open(file_name, 'wb') as fp: 
    fp.write(html.encode('utf-8'))


In [None]:
import IPython
# Display the report from the same relative path it was written to; the
# previous hard-coded '/content/...' path only resolves when the working
# directory happens to be /content.
IPython.display.HTML(filename='ScattertextLog.html')

In [None]:
# Mount Google Drive under /content/drive (only available inside Google Colab).
# NOTE(review): nothing in the visible notebook reads from the mounted drive — confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

# Multinomial Naive Bayes

In [None]:
# Timer for the whole Naive Bayes section; it is read again when the accuracies are printed.
t1  = time.time()

from sklearn.model_selection import train_test_split
# Convert the category names back to numeric 0/1 labels for modelling.
dataset_sampled = dataset_sampled.replace({'sentiment': 'negative'}, 0)
dataset_sampled = dataset_sampled.replace({'sentiment': 'positive'}, 1)
# 70/30 split of the preprocessed text with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(dataset_sampled['pre_processed_text'],dataset_sampled['sentiment'] , test_size=0.30, random_state=42)
print(X_train, X_test, y_train, y_test)
# dataset_sampled
# dataset_sampled

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams to trigrams, ignoring terms that occur in fewer than 10
# training documents. (The earlier default-constructed vectorizer was
# overwritten immediately and has been removed.)
vectorizer = TfidfVectorizer(min_df=10, ngram_range=(1,3))

# Learn vocabulary and IDF weights from the training split only, then apply the
# same fitted transform to the test split.
x_tr = vectorizer.fit_transform(X_train)
x_te = vectorizer.transform(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.metrics import roc_curve, auc
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from itertools import chain
from sklearn.linear_model import LogisticRegression

In [None]:
def model_Evaluate(model, X_test_data, Y_test_data):
    """Print a classification report and draw the confusion matrix of a binary model.

    Parameters
    ----------
    model : object with a ``predict`` method
        Its output may be hard 0/1 labels or scores; both are thresholded at 0.5.
    X_test_data : features passed unchanged to ``model.predict``.
    Y_test_data : true binary labels (0 = negative, 1 = positive).

    Returns
    -------
    None. The report is printed and the heatmap is drawn on the current
    matplotlib axes.
    """
    # Predict values for Test dataset
    y_pred_cont = model.predict(X_test_data)

    # Scores >= 0.5 become class 1. reshape(-1) flattens column-shaped (n, 1)
    # outputs so each sample yields exactly one label.
    y_pred_bin = (np.asarray(y_pred_cont).reshape(-1) >= 0.5).astype(int)

    # Print the evaluation metrics for the dataset.
    print(classification_report(Y_test_data, y_pred_bin))

    # Compute and plot the Confusion matrix. Pinning labels keeps the matrix
    # 2x2 even if one class is absent, so the reshape below cannot fail.
    cf_matrix = confusion_matrix(Y_test_data, y_pred_bin, labels=[0, 1])

    categories  = ['Negative','Positive']
    group_names = ['True Neg','False Pos', 'False Neg','True Pos']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]

    # One annotation per cell: its name on the first line, its share of all samples on the second.
    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    sns.heatmap(cf_matrix, annot = labels, cmap = 'Blues',fmt = '',xticklabels = categories, yticklabels = categories)

    plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
    plt.ylabel("Actual values"   , fontdict = {'size':14}, labelpad = 10)
    plt.title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)

In [None]:
# Tune the smoothing parameter alpha of Multinomial Naive Bayes with
# 10-fold cross-validation, scored by ROC AUC. Train scores are kept so they
# can be compared with the validation scores afterwards.
model = MultinomialNB()  
parameters = {'alpha':[0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5,
10, 50, 100]}
clf = GridSearchCV(model, parameters, cv=10,scoring='roc_auc',return_train_score=True)
clf.fit(x_tr, y_train)

In [None]:
# `results` was never defined in the notebook; build it from the fitted grid
# search. cv_results_ has one entry per candidate alpha, in grid order.
results = pd.DataFrame(clf.cv_results_)
train_auc = results['mean_train_score'].values  # mean train AUC per alpha
cv_auc = results['mean_test_score'].values      # mean cross-validation AUC per alpha

In [None]:
# Flatten the grid's value lists into one list of candidate alphas
# (the grid has a single key, so this is just the alpha list).
alphas = list(chain.from_iterable(parameters.values()))

In [None]:
# Train vs. cross-validation AUC for every candidate alpha.
plt.plot(alphas, train_auc, label='Train AUC')
plt.plot(alphas, cv_auc, label='CV AUC')
plt.scatter(alphas, train_auc, label='Train AUC points')
plt.scatter(alphas, cv_auc, label='CV AUC points')

# The candidates span 1e-5 to 100; on a linear axis almost all of them collapse
# onto x = 0, so a logarithmic x-axis is used.
plt.xscale('log')

plt.legend()
plt.xlabel("Alpha: hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()

In [None]:
# Alpha selected by the grid search.
bestparam=clf.best_params_['alpha']   #extracting the best hyperparameter
print("The best Alpha=",bestparam)

In [None]:
# Refit a fresh Multinomial Naive Bayes model on the training features using the selected alpha.
mul_model = MultinomialNB(alpha=bestparam) #Building a Naive Bayes model with the best alpha
mul_model.fit(x_tr,y_train)

In [None]:
# Column 1 of predict_proba is the predicted probability of class 1 (positive).
y_train_pred = mul_model.predict_proba(x_tr)[:,1]  #Predicted probability of the positive class (not a log probability)
y_test_pred = mul_model.predict_proba(x_te)[:,1]
# ROC curves for both splits; the train thresholds are reused later to pick a cut-off.
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)   
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.title("AUC PLOTS")             #Plotting the train and test ROC curves with their AUC
plt.grid()
plt.show()

In [None]:
# Area under the train and test ROC curves, rounded to three decimals.
trauc=round(auc(train_fpr, train_tpr),3)
teauc=round(auc(test_fpr, test_tpr),3)
print('Train AUC=',trauc)
print('Test AUC=',teauc)

In [None]:
def find_best_threshold(threshould, fpr, tpr):
    """Return the threshold that maximises ``tpr * (1 - fpr)``.

    ``threshould``, ``fpr`` and ``tpr`` are parallel arrays such as those
    returned by ``roc_curve``. The best score and the chosen threshold are
    printed; on ties the first maximum wins.
    """
    scores = tpr*(1-fpr)
    best_threshold = threshould[np.argmax(scores)]
    print("the maximum value of tpr*(1-fpr)", max(scores), "for threshold", np.round(best_threshold,3))
    return best_threshold

def predict_with_best_t(proba, threshould):
    """Turn probabilities into 0/1 labels: 1 where the value is >= ``threshould``, else 0.

    Returns a plain list with one label per element of ``proba``.
    """
    return [1 if score >= threshould else 0 for score in proba]

In [None]:
# Choose the cut-off on the training ROC curve, then apply the same cut-off to
# both splits to obtain their confusion matrices.
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
TRCM=confusion_matrix(y_train, predict_with_best_t(y_train_pred, best_t))
TECM=confusion_matrix(y_test, predict_with_best_t(y_test_pred, best_t))

def CM(x,y):
    """Draw the 2x2 confusion matrix ``x`` as an annotated heatmap titled ``y``.

    Each cell shows its role (TN/FP/FN/TP) with the count underneath.
    """
    cell_names = ('TN','FP','FN','TP')
    # One "<name>\n<count>" string per cell, in the matrix's flattened order.
    annotations = np.asarray(
        [f"{name}\n{count:0.0f}" for name, count in zip(cell_names, x.flatten())]
    ).reshape(2,2)
    sns.heatmap(x, annot=annotations, fmt='', cmap='BuPu')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(y)
    plt.plot()

In [None]:
# Confusion matrix on the training split at the chosen threshold.
CM(TRCM,'Train Confusion Matrix')

In [None]:
# Confusion matrix on the test split at the same threshold.
CM(TECM,'Test Confusion Matrix')

In [None]:
# Accuracy in percent = (TN + TP) / total, read off each confusion matrix.
print("Train accuracy = ",(TRCM[0,0]+TRCM[1,1])/np.sum(TRCM)*100)   
print("Test accuracy = ",(TECM[0,0]+TECM[1,1])/np.sum(TECM)*100)
# t1 was started in the train/test split cell, so this covers the whole Naive Bayes section.
print(f'Time Taken by Naive Bayes Model After Preprocessing: {round(time.time()-t1)} seconds')

# Logistic Regression

In [None]:
# Logistic regression on the same TF-IDF features, evaluated on the held-out test split.
LRmodel = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
LRmodel.fit(x_tr, y_train)
model_Evaluate(LRmodel, x_te, y_test)

# LSTM

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
def read_glove_vecs(glove_file):
    """Load a GloVe text file into lookup tables.

    Each non-empty line must be ``<word> <v1> <v2> ...`` separated by whitespace.

    Parameters
    ----------
    glove_file : str
        Path to a UTF-8 encoded GloVe text file.

    Returns
    -------
    words_to_index : dict
        word -> index, assigned in sorted word order starting at 1.
    index_to_words : dict
        The inverse mapping, index -> word.
    word_to_vec_map : dict
        word -> 1-D ``np.float64`` array holding its embedding.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # A blank line (e.g. at the end of the file) has no word to index.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # The dict keys are already the unique words, so no separate set is needed.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [None]:
# Load the GloVe vectors from 'glove.6B.50d.txt' into the three lookup tables.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [None]:
def cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    Computed as ``dot(u, v) / (||u|| * ||v||)`` with Euclidean norms. No
    special handling is done for zero-length vectors.
    """
    def _l2_norm(vec):
        return np.sqrt(np.sum(vec**2))

    return np.dot(u, v) / (_l2_norm(u) * _l2_norm(v))

In [None]:
# Timer for this cell's progress messages.
start_time = time.time()

# Filled by sentence_to_indices:
#   UNKS - words missing from the embedding vocabulary as originally spelled
#   unks - words still missing after collapsing repeated characters
unks = []
UNKS = []

def cleared(word):
    """Collapse every run of identical consecutive characters to one character.

    Used as a last resort when a word is not found in the embedding
    vocabulary, e.g. ``"goooood"`` becomes ``"god"``.
    """
    chars = list(word)
    # Pair each character with its predecessor; the first one has none.
    predecessors = [None] + chars[:-1]
    return "".join(cur for before, cur in zip(predecessors, chars) if cur != before)


def sentence_to_indices(sentence_words, word_to_index, max_len, i):
    """Write the vocabulary indices of one tokenised sentence into row ``i`` of ``X``.

    Lookup order for each word: the word itself, then the word with repeated
    characters collapsed by ``cleared``, then the ``'unk'`` entry.

    Parameters
    ----------
    sentence_words : sequence of str
        Tokens of the sentence, in order.
    word_to_index : mapping
        word -> integer index; must contain the key ``'unk'``.
    max_len : int
        Unused; kept so existing calls keep working.
    i : int
        Row of the module-level array ``X`` to fill.

    Side effects
    ------------
    Writes ``X[i, j]`` for every token position ``j``, appends each word
    missing from the vocabulary to ``UNKS``, and appends its collapsed form to
    ``unks`` when that is missing too. Returns None.
    """
    # X, UNKS and unks are module-level objects that are only mutated, never
    # rebound, so no `global` declaration is needed.
    for j, w in enumerate(sentence_words):
        try:
            index = word_to_index[w]
        except KeyError:
            # Only a missing key triggers the fallback; other errors propagate.
            UNKS.append(w)
            w = cleared(w)
            try:
                index = word_to_index[w]
            except KeyError:
                index = word_to_index['unk']
                unks.append(w)
        X[i, j] = index

        
# Here we will utilize the already computed 'cleaned_token_list' variable,
# whose items are unpacked as (tokens, label) pairs.

# NOTE(review): no noise removal happens in this cell, so this only reports the time since the cell started.
print('Removed Noise, CPU Time:', time.time() - start_time)
start_time = time.time()

# The longest token list sets the width of the index matrix.
list_len = [len(i) for i, j in cleaned_token_list]
max_len = max(list_len)
print('max_len:', max_len)

# X: one row of word indices per tweet, left at 0 beyond the tweet's length.
# Y: one label per tweet.
X = np.zeros((len(cleaned_token_list), max_len))
Y = np.zeros((len(cleaned_token_list), ))

for i, tk_lb in enumerate(cleaned_token_list):
    tokens, label = tk_lb
    sentence_to_indices(tokens, word_to_index, max_len, i)
    Y[i] = label
    
print('Data Prepared for model, CPU Time:', time.time() - start_time)


# Preview the first five rows and labels.
print(X[:5])
print(Y[:5])

In [None]:
import keras
from keras import Sequential
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index, max_len):
    """Create a frozen Keras ``Embedding`` layer initialised with the given word vectors.

    Parameters
    ----------
    word_to_vec_map : dict
        word -> embedding vector; must contain ``"unk"``, whose length sets
        the embedding dimension.
    word_to_index : dict
        word -> row index in the embedding matrix.
    max_len : int
        Sequence length, used as the layer's ``input_shape``.

    Returns
    -------
    The built, non-trainable layer with its weights set.
    """
    # One row more than the number of words; rows not assigned below stay all-zero.
    vocab_size = 1 + len(word_to_index)
    vector_size = word_to_vec_map["unk"].shape[0]

    weights = np.zeros((vocab_size, vector_size))
    for token in word_to_index:
        weights[word_to_index[token], :] = word_to_vec_map[token]

    layer = Embedding(vocab_size, vector_size, trainable=False, input_shape=(max_len,))
    # The layer has to be built before its weights can be set.
    layer.build((None,))
    layer.set_weights([weights])

    return layer

In [None]:
# Frozen pretrained embeddings -> two stacked bidirectional LSTMs -> sigmoid output.
LSTM_model = Sequential()

LSTM_model.add(pretrained_embedding_layer(word_to_vec_map, word_to_index, max_len))
# The first LSTM returns the full sequence so the second LSTM can consume it.
LSTM_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
LSTM_model.add(Bidirectional(LSTM(units=128, return_sequences=False)))
LSTM_model.add(Dense(units=1, activation='sigmoid'))

LSTM_model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit.
LSTM_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Stratified 80/20 split of the index matrix and labels, with a fixed seed.
X_train_data, X_test_data, Y_train_data, Y_test_data = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

In [None]:
# Train for 20 epochs; the held-out test split doubles as the validation set.
LSTM_model.fit(X_train_data, Y_train_data, validation_data=(X_test_data, Y_test_data), epochs = 20, batch_size = 128, shuffle=True)

In [None]:
def plot_acc_loss(history):
    """Plot training vs. validation accuracy and loss per epoch.

    Parameters
    ----------
    history : object with a ``history`` dict
        Must contain the keys ``'accuracy'``, ``'val_accuracy'``, ``'loss'``
        and ``'val_loss'``, each a per-epoch sequence.

    Draws the accuracy curves on the current figure and the loss curves on a
    new figure, then shows them. Returns None.
    """
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, acc, 'bo', label = 'Training Accuracy')
    plt.plot(epochs, val_acc, 'r', label = 'Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    # This axis shows accuracy; it was previously mislabelled as 'Loss'.
    plt.ylabel('Accuracy')
    plt.legend()

    plt.figure()
    plt.plot(epochs, loss, 'bo', label = 'Training Loss')
    plt.plot(epochs, val_loss, 'r', label = 'Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Learning curves of the first LSTM run, read from the model's `history` attribute.
plot_acc_loss(LSTM_model.history)

In [None]:
# Index that sentence_to_indices writes for words missing from the vocabulary.
unk = word_to_index['unk']

# Count the matching cells of X in one vectorized pass instead of a nested
# Python loop over every cell.
n_unk_words = int((X == unk).sum())

n_unk_words

In [None]:
from collections import Counter
# The 50 most frequent words that were still unknown after collapsing repeated characters.
Counter(unks).most_common(50)

In [None]:
# Reset Keras' global state before building a second model.
keras.backend.clear_session()

# Same architecture as the first LSTM model.
model_clean_data = Sequential()

model_clean_data.add(pretrained_embedding_layer(word_to_vec_map, word_to_index, max_len))
model_clean_data.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model_clean_data.add(Bidirectional(LSTM(units=128, return_sequences=False)))
model_clean_data.add(Dense(units=1, activation='sigmoid'))

model_clean_data.summary()

In [None]:
# Same loss, optimizer and metric as the first LSTM model.
model_clean_data.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Same stratified 80/20 split and seed as used for the first LSTM model.
X_train_data, X_test_data, Y_train_data, Y_test_data = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

In [None]:
# Train for 10 epochs; the held-out test split doubles as the validation set.
model_clean_data.fit(X_train_data, Y_train_data, validation_data=(X_test_data, Y_test_data), epochs = 10, batch_size = 128, shuffle=True)

In [None]:
# Learning curves of the second LSTM run.
plot_acc_loss(model_clean_data.history)

In [None]:
# Classification report and confusion matrix of the second LSTM on the held-out split.
model_Evaluate(model_clean_data, X_test_data, Y_test_data)