### Imports

In [1]:
# !python -m pip install -r require.txt
# !python.exe -m pip install --upgrade pip

In [2]:
# !C:\Users\JainYashVija\Desktop\Projects\Learnings\data-drift-nlp\data-drift-nlp-env\Scripts\pip.exe install -r require.txt

In [3]:
# !pip show tensorflow

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM,  Embedding
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import scale
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.downloader as api
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
glove_wiki = api.load("glove-wiki-gigaword-50")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JainYashVija\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JainYashVija\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Data Preparation

In [4]:
data_reviews = pd.read_csv("../data/Reviews.csv")
data_reviews.drop_duplicates(keep="first",inplace=True)
data_reviews = data_reviews.iloc[:,[6,9]]

In [5]:
data_twitter_biden = pd.read_csv("../data/hashtag_joebiden.csv")
data_twitter_trump = pd.read_csv("../data/hashtag_donaldtrump.csv")
data_twitter = pd.concat([data_twitter_trump["tweet"],data_twitter_biden["tweet"]], axis=0, ignore_index=True)
data_twitter = data_twitter.dropna()
data_twitter = data_twitter.sample(frac=1)
data_twitter = data_twitter.reset_index()
data_twitter.drop('index',inplace=True,axis=1)

  data_twitter_biden = pd.read_csv("../data/hashtag_joebiden.csv")
  data_twitter_trump = pd.read_csv("../data/hashtag_donaldtrump.csv")


In [7]:
data_twitter.head(15)

Unnamed: 0,tweet
0,@realDonaldTrump \n@JoeBiden \n#ThePostElectio...
1,What arrogance..to think your view is the only...
2,@JimmyPatronis @60Minutes @realDonaldTrump @CB...
3,Y acabó el terror #Trump esperen los pedos que...
4,France celebrates the fall of the house of #Tr...
5,0.0
6,0.0
7,"Values, decency, respect are disappearing. Ame..."
8,@Sethrogen l made these pics of #DonaldTrump a...
9,#Biden leads in Michigan! Can he flip this key...


In [8]:
data_reviews.head(15)

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...
5,4,I got a wild hair for taffy and ordered this f...
6,5,This saltwater taffy had great flavors and was...
7,5,This taffy is so good. It is very soft and ch...
8,5,Right now I'm mostly just sprouting this so my...
9,5,This is a very healthy dog food. Good for thei...


sentence tokenization and data cleaning

In [9]:
def text_cleaning(data,r: list):
    lemmatizer = WordNetLemmatizer()
    corpus = []
    
    for i in range(r[0], r[1]):
        review = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",data['Text'][i]).lower().split()
        review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)

    return corpus

In [10]:
def text_cleaning_twitter(data, r: list):
    lemmatizer = WordNetLemmatizer()
    corpus = []
    
    for i in range(r[0], r[1]):
        review = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",data_twitter.iloc[i,0]).lower().split()
        review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)

    return corpus

In [7]:
def save_file(filename, data):
    filename = filename
    data = data

    with open(filename, "w") as outfile:
        outfile.write('\n'.join(data))

In [8]:
def load_file(filename):
    with open(filename) as f:
        corpus = [line for line in f.readlines()]
    return corpus

In [13]:
# corpus_train = text_cleaning(data_reviews,[0,10000])
# corpus_test = text_cleaning(data_reviews, [60000,70000])

# save_file("corpus_test.txt",corpus_test)
# save_file("corpus_train.txt",corpus_train)

In [14]:
# corpus_twitter_train = text_cleaning_twitter(data_twitter, [0,10000])
# corpus_twitter_test = text_cleaning_twitter(data_twitter, [60000,70000])

# save_file("corpus_twitter_train.txt",corpus_twitter_train)
# save_file("corpus_twitter_test.txt",corpus_twitter_test)

In [9]:
corpus_train = load_file("corpus_train.txt")
corpus_test = load_file("corpus_test.txt")
corpus_drifted_test = load_file("corpus_test_drift_ingested.txt")

corpus_twitter_train = load_file("corpus_twitter_train.txt")
corpus_twitter_test = load_file("corpus_twitter_test.txt")
corpus_twitter_drifted_test = load_file("corpus_twitter_test_drift_ingested.txt")

### Drift Ingestion

In [16]:
def get_top_n_words(corpus):
    vectorizer = TfidfVectorizer(stop_words='english', min_df=15) 
    vector = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names_out()
    sums = vector.sum(axis=0)
    data = []
    for col, term in enumerate(terms):
        data.append( (term, sums[0,col] ))

    ranking = pd.DataFrame(data, columns=['term','rank'])
    ranking.sort_values('rank', ascending=False, inplace=True)
    ranking.reset_index(inplace=True)
    ranking.drop('index', axis=1, inplace=True)
    print(ranking.head(20))
    return list(ranking.iloc[:20,0])

In [17]:
def drift_ingestion(corpus_train):
    corpus = []
    with open('corpus_twitter_test.txt') as f:
        words_to_replace = get_top_n_words(corpus_train)
        for line in f.readlines():
            big_regex = re.compile('|'.join(map(re.escape, words_to_replace)))
            line = big_regex.sub("<replaced>", line)
            corpus.append(line)
            
    save_file("corpus_twitter_test_drift_ingested.txt",corpus)

In [18]:
drift_ingestion(corpus_twitter_train)

               term        rank
0             trump  978.177660
1             biden  730.413562
2          joebiden  459.071539
3       donaldtrump  266.965351
4      election2020  250.576550
5              vote  238.512410
6                la  175.999538
7         president  167.720342
8          election  166.916309
9     elections2020  164.079781
10              amp  145.634621
11  bidenharris2020  142.237255
12        trump2020  141.251326
13              usa  139.284083
14               le  122.289898
15          america  117.248508
16               en  116.794041
17              win  116.789772
18      electionday  114.624699
19              joe  110.371409


### Sentence Bert Transformer (SBert) Embedding

In [10]:
def cosine_similarity(embedding_train,embedding_test):
    cosine_scores = util.cos_sim(embedding_train, embedding_test)
    return np.mean(np.mean(np.absolute(np.array(cosine_scores)),axis=1),axis=0)

Review Data

In [11]:
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_train = bert_model.encode(corpus_train, convert_to_tensor=True)
embedding_test = bert_model.encode(corpus_test, convert_to_tensor=True)
embedding_drifted_test = bert_model.encode(corpus_drifted_test, convert_to_tensor=True)

In [12]:
cosine_similarity(embedding_train,embedding_test)

0.2820735

In [13]:
cosine_similarity(embedding_train,embedding_drifted_test)

0.15698944

Twitter Data

In [23]:
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_twitter_train = bert_model.encode(corpus_twitter_train, convert_to_tensor=True)
embedding_twitter_test = bert_model.encode(corpus_twitter_test, convert_to_tensor=True)
embedding_twitter_drifted_test = bert_model.encode(corpus_twitter_drifted_test, convert_to_tensor=True)

In [24]:
cosine_similarity(embedding_twitter_train,embedding_twitter_test)

0.25215048

In [25]:
cosine_similarity(embedding_twitter_train,embedding_twitter_drifted_test)

0.1552168

### Adversarial Validation Using LSTM and Keras Embedding.

In [56]:
def text_preprocessing(corpus):
    vocabulary_size = 10000
    tokenizer = Tokenizer(num_words= vocabulary_size)
    tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)
    data = pad_sequences(sequences, maxlen=200)
    return pd.DataFrame(data)

In [57]:
def create_df_with_labels(train,test):
    train['labels'] = 1
    test['labels'] = 0
    all_data = pd.concat([train, test], axis=0, ignore_index=True)

    all_data_shuffled = all_data.sample(frac=1)

    X = all_data_shuffled.drop(['labels'], axis=1)
    y = all_data_shuffled['labels']
    return X,y

In [58]:
def build_and_run_model(train,test):
    X,y = create_df_with_labels(train,test)
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    model = Sequential()
    model.add(Embedding(10000, 200, input_length=200))
    model.add(LSTM(64, dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    print('Training the RNN')
    model.fit(X,y,validation_split=0.4, epochs=100,callbacks=[callback])
    return model


Review data

In [59]:
data_train = text_preprocessing(corpus_train)
data_test = text_preprocessing(corpus_test)

model = build_and_run_model(data_train,data_test)

Training the RNN
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [60]:
_, test_acc = model.evaluate(data_test.iloc[:,:-1], data_test.iloc[:,-1], verbose=0)
_, train_acc = model.evaluate(data_train.iloc[:,:-1], data_train.iloc[:,-1], verbose=0)

print(test_acc)
print(train_acc)

0.910099983215332
0.9121000170707703


Twitter data 

In [61]:
data_twitter_train = text_preprocessing(corpus_twitter_train)
data_twitter_test = text_preprocessing(corpus_twitter_test)

model = build_and_run_model(data_twitter_train,data_twitter_test)

Training the RNN
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


In [62]:
_, test_acc_twitter = model.evaluate(data_twitter_test.iloc[:,:-1], data_twitter_test.iloc[:,-1], verbose=0)
_, train_acc_twitter = model.evaluate(data_twitter_train.iloc[:,:-1], data_twitter_train.iloc[:,-1], verbose=0)

print(test_acc_twitter)
print(train_acc_twitter)

0.8677999973297119
0.8962000012397766


### Glove Embedding

In [14]:
def get_GloVe(text, size, vectors, aggregation='mean'):
    vec = np.zeros(size).reshape((1, size)) 
    count = 0
    for word in text.split():
        try:
            vec += vectors[word].reshape((1, size)) 
            count += 1 
        except KeyError:
            continue
    if aggregation == 'mean':
        if count != 0:
            vec /= count  #get average of vector to create embedding for sentence
        return vec
    elif aggregation == 'sum':
        return vec

Review Data

In [73]:
# glove_embeddings_train = scale(np.concatenate([get_GloVe(text,50,glove_wiki) for text in corpus_train]))
# glove_embeddings_test = scale(np.concatenate([get_GloVe(text,50,glove_wiki) for text in corpus_test]))

In [74]:
# np.savetxt("glove_embeddings_train.txt",glove_embeddings_train)

In [15]:
glove_embeddings_train = np.loadtxt('glove_embeddings_train.txt')
glove_embeddings_test = np.loadtxt('glove_embeddings_test.txt')

In [16]:
cosine_similarity(glove_embeddings_train,glove_embeddings_test)

0.18863798703748552

Twitter Data

In [68]:
# glove_embeddings_twitter_train = scale(np.concatenate([get_GloVe(text,50,glove_wiki) for text in corpus_twitter_train]))
# glove_embeddings_twitter_test = scale(np.concatenate([get_GloVe(text,50,glove_wiki) for text in corpus_twitter_test]))

In [69]:
# np.savetxt("glove_embeddings_twitter_test.txt",glove_embeddings_twitter_test)

In [17]:
glove_embeddings_twitter_train = np.loadtxt('glove_embeddings_twitter_train.txt')
glove_embeddings_twitter_test = np.loadtxt('glove_embeddings_twitter_test.txt')

In [18]:
cosine_similarity(glove_embeddings_twitter_train,glove_embeddings_twitter_test)

0.2288196098694682