In [None]:
import pandas as pd
import numpy as np
import pandasql as pf
import re
from nltk import word_tokenize
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords

from keras.layers import SimpleRNN,LSTM,Conv1D,MaxPooling1D,Dropout, Input, Bidirectional, Activation, Flatten
from keras import regularizers
from keras.layers import BatchNormalization
from keras import optimizers
from keras import initializers
from keras.utils.vis_utils import plot_model

from keras.callbacks import *
from keras import backend as K
import keras 

import tensorflow as tf

from keras.callbacks import *
from keras.optimizers import Adam

In [None]:
# Load the transcript dataset. The path literal is an empty placeholder and
# must be replaced with the location of the source CSV before running.
df = pd.read_csv('')

Removing records that have no subtitles

In [None]:
print('Before removing "no subtitles", # records =',len(df))

In [None]:
# Keep only the rows whose transcript is not the "Nosubtitles" placeholder.
has_subtitles = df["transcript"] != "Nosubtitles"
df = df.loc[has_subtitles]


In [None]:
print('after removing "no subtitles", # records =',len(df))

Dropping records with null values in our target variable

In [None]:
# Drop every row whose 'Course' value is missing.
df = df.dropna(subset =['Course'])

In [None]:
len(df)

In [None]:
df['Course'].value_counts()

In [None]:
# Retain only the courses that contribute at least 70 rows.
test = df.groupby("Course").filter(lambda group: len(group) >= 70)

In [None]:
test['Course'].value_counts()

In [None]:
# Draw up to 10 rows from every course. The fixed seed makes the subsample
# (and everything derived from it) reproducible across kernel restarts.
dfsub = test.groupby('Course', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 10), random_state=42)
)

Selecting only ten records for each course

In [None]:
dfsub['Course'].value_counts()

In [None]:
dfsub.to_csv('')

In [None]:
df = dfsub

Creating X and Y

In [None]:
# Features: the transcript text, kept as a one-column DataFrame.
X = df[['transcript']]
# Target: the course label, also as a one-column DataFrame.
y = df[['Course']]
# A second selection of the same 'Course' column under its own name.
y_course = df[['Course']]

### Ordinal-encoding the target variable

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode each course label as a number and store the integer codes in a
# single 'ordinal' column.
encoder = OrdinalEncoder()
course_codes = encoder.fit_transform(y)
y = pd.DataFrame(course_codes, columns=['ordinal']).astype(int)

In [None]:
y['ordinal'].unique()

### Removing Stop Words from transcript

In [None]:
# Noise patterns that are stripped from every transcript.
pat1 = r'#[^ ]+'      # '#' followed by any run of non-space characters
pat2 = r'www\.[^ ]+'  # bare web addresses; the dot is escaped so only a literal "." matches
pat3 = r'@[^ ]+'      # '@' followed by any run of non-space characters
pat4 = r'[0-9]+'      # runs of digits

# Contractions and the expanded form each one is replaced with.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}


# Single alternation of all noise patterns, used with re.sub.
pattern = '|'.join((pat1,pat2,pat3,pat4))
# Matches any contraction key as a whole word.
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

In [None]:
neg_pattern

In [None]:
stop_words = stopwords.words('english')
# Set copy of the stop words so each membership test is O(1) rather than a list scan.
stop_word_set = set(stop_words)
clean_subs = []


for t in X['transcript']:
    # str.lower() returns a new string, so the result has to be assigned back;
    # the stop-word and contraction matching below is case-sensitive.
    t = t.lower()
    # Strip the noise patterns, then expand contractions via negations_dic.
    t = re.sub(pattern, '', t)
    t = neg_pattern.sub(lambda m: negations_dic[m.group()], t)
    # Keep tokens that are longer than one character, are not stop words and
    # consist of letters only.
    tokens = [
        tok for tok in word_tokenize(t)
        if len(tok) > 1 and tok not in stop_word_set and tok.isalpha()
    ]
    t = " ".join(tokens)
    # Kept for parity with the original cell: after the alphabetic filter no
    # apostrophes remain, so these two substitutions leave the text unchanged.
    t = re.sub("n't","not",t)
    t = re.sub("'s","is",t)
    clean_subs.append(t)

In [None]:
X = pd.DataFrame(clean_subs,columns = ['transcript'])

### Tokenizing and padding sequences

In [None]:
X.head(10)

In [None]:
# Fit a Keras tokenizer on the cleaned transcripts.
# NOTE(review): num_words=300 presumably restricts texts_to_sequences to the
# most frequent words only, while word_index still holds the full vocabulary
# — confirm against the Keras Tokenizer documentation.
tk = Tokenizer(num_words=300)
tk.fit_on_texts(X['transcript'])

# Convert every transcript into a list of integer word ids.
df_tok = tk.texts_to_sequences(X['transcript'])
#X_test_tok = tk.texts_to_sequences(X_test)

# Bring every sequence to a fixed length of max_len, padding at the end.
max_len =300
df_pad = pad_sequences(df_tok, maxlen=max_len, padding = 'post')
#X_test_pad = pad_sequences(X_test_tok, maxlen=max_len)

In [None]:
df_pad[0:10]

In [None]:
unique_vocab = len(tk.word_index)
print(unique_vocab)

In [None]:
len(tk.word_index)

In [None]:
print(tk.word_index)

### Fine-tuning a pre-trained word-embedding model on the words found in the data

In [None]:
# Collect every word known to the tokenizer, as plain strings, in index order.
words_intext = [str(token) for token in tk.word_index]

In [None]:
print(words_intext)

In [None]:
type(words_intext)

In [None]:
#pip install gensim

In [None]:
from gensim.models import Word2Vec
word2vec_model = Word2Vec(size = 300, window=5,
min_count = 1)
# help(word2vec_model.build_vocab)
word2vec_model.build_vocab(words_intext)
word2vec_model.intersect_word2vec_format('C:/Users/*****/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)


In [None]:
word2vec_model.train(words_intext, total_examples=1, epochs = 5)

In [None]:
word2vec_model.wv.vocab

In [None]:
word2vec_model['tableau']

### Creating the embeddings

In [None]:
embedding_dim = 300

# One row per tokenizer index, one column per embedding feature. Rows start
# as zeros, so any word missing from the word2vec vocabulary keeps a zero vector.
# The extra row presumably accounts for index 0 (padding) — TODO confirm.
embedding_matrix = np.zeros((unique_vocab+1, embedding_dim))

# Copy the vector of every known word into the row given by its tokenizer index.
# Vectors are read through `.wv`, matching the vocabulary lookup, instead of the
# deprecated direct indexing on the model object.
for word, i in tk.word_index.items():
    if word in word2vec_model.wv.vocab:
        embedding_matrix[i] = word2vec_model.wv[word]

In [None]:
embedding_matrix.shape

### Cross-joining the padded dataset to build pairs of transcripts

In [None]:
df_pad_df = pd.DataFrame(df_pad)

In [None]:
# Serialise each padded sequence into one comma-separated string per row.
df_merged = df_pad_df.astype(str).apply(','.join, axis=1)

In [None]:
# Attach the integer course code to every serialised sequence. The join is on
# the row index, so both frames must carry the same positional index.
df_merged = pd.merge(pd.DataFrame(df_merged),y, left_index=True, right_index=True)

In [None]:
# Cross join of df_merged with itself: one output row per ordered pair of
# input rows, which includes each row paired with itself.
dfcj = pf.sqldf('''select * from df_merged a,df_merged b''')

In [None]:
# Label a pair 1 when the values in columns 1 and 3 are equal, else 0.
# Presumably these are the course codes of the left and right row of the pair
# (sequence, code, sequence, code) — verify the column order returned by sqldf.
dfcj['similar'] = np.where(dfcj.iloc[:,1] == dfcj.iloc[:,3] , 1, 0)

In [None]:
dfcj = dfcj.drop(['ordinal'], axis = 1)

### Create train and test

In [None]:
# 70/30 split of the pair table, stratified on the 'similar' label, with a
# fixed seed. The first two columns are the inputs; 'similar' is the target.
# NOTE(review): the split is over pairs, so the same transcript can occur in
# both the train and the test pairs — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(dfcj.iloc[:,0:2], dfcj['similar'], test_size=0.3, random_state=42,stratify=dfcj['similar'])

In [None]:
def _expand_sequence(column):
    """Split a column of comma-separated strings into a 2-D array of string tokens."""
    return column.str.split(',', expand=True).values


# Column 0 holds the first sequence of each pair, column 1 the second.
x1_train = _expand_sequence(X_train.iloc[:, 0])
x2_train = _expand_sequence(X_train.iloc[:, 1])
x1_test = _expand_sequence(X_test.iloc[:, 0])
x2_test = _expand_sequence(X_test.iloc[:, 1])

In [None]:
print(x1_train.shape)
print(x2_train.shape)
print(x1_test.shape)
print(x2_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Cast the four string-token arrays to integers.
x1_train, x2_train, x1_test, x2_test = (
    tokens.astype(int) for tokens in (x1_train, x2_train, x1_test, x2_test)
)

### LSTM Siamese Network

In [None]:
# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False), so its weights are not updated during training.
emb = Embedding(input_dim=unique_vocab+1, output_dim=embedding_dim, input_length=max_len,
                    weights=[embedding_matrix],trainable=False)

In [None]:
# First tower: embed the first sequence of a pair, then run it through two
# stacked bidirectional LSTMs (30 units returning sequences, then 10 units).
input_q1 = Input(shape=(max_len,))
e1 = emb(input_q1)
layer1_1 = Bidirectional(LSTM(30, return_sequences=True))(e1)
x1 = Bidirectional(LSTM(10))(layer1_1)                              
                      
# Second tower: same layout for the second sequence of the pair.
# NOTE(review): only `emb` is reused here; the LSTM layers are new instances,
# so the two towers do not share recurrent weights — confirm this is intended
# for a Siamese network.
input_q2 = Input(shape=(max_len,))
e2 = emb(input_q2)
layer1_2 = Bidirectional(LSTM(30, return_sequences=True))(e2)
x2 = Bidirectional(LSTM(10))(layer1_2)  

In [None]:
def mhd(tensors):
    """Return the element-wise absolute difference |a - b| of a pair of tensors.

    NOTE(review): `mhd` was referenced but not defined anywhere in the visible
    notebook. This implementation is inferred from the layer name
    'L1_distance' and from output_shape returning the shape of the first
    input — confirm it matches the function originally used.

    Args:
        tensors: a list/tuple holding the two tower outputs.

    Returns:
        A tensor with the same shape as each input.
    """
    left, right = tensors
    return K.abs(left - right)


# Combine the two tower outputs and map the result to a single similarity
# score. Layers and Model come from the same `keras` namespace as the Input,
# Embedding and LSTM layers that produced x1/x2, rather than from tf.keras.
merged = keras.layers.Lambda(function=mhd, output_shape=lambda x: x[0],
                             name='L1_distance')([x1, x2])
preds = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[input_q1, input_q2], outputs=preds)

In [None]:
model.summary()

In [None]:
# Train with mean squared error on the sigmoid output, using the Adam
# optimizer and tracking accuracy.
# NOTE(review): for a 0/1 target, binary cross-entropy is the more usual
# pairing with a sigmoid output — confirm MSE is intended.
model.compile(loss="mean_squared_error",optimizer="adam",metrics=['accuracy'])

In [None]:
# Fit on the training pairs for 10 epochs, holding out 10% of the training
# data for validation. The returned history is used for the loss plot below.
history = model.fit([x1_train,x2_train],
                    y_train,
                    epochs=10,
                    validation_split=0.1
                   )

In [None]:
y_pred = model.predict([x1_test,x2_test])

In [None]:
results = model.evaluate([x1_test,x2_test],y_test,batch_size=50)

In [None]:
from matplotlib import pyplot

# Plot the training curve first, then the validation curve, on the same axes.
for series_name in ('loss', 'val_loss'):
    pyplot.plot(history.history[series_name])

pyplot.title('model train vs validation loss')
pyplot.ylabel('loss')
pyplot.xlabel('epoch')
pyplot.legend(['train', 'validation'])
pyplot.show()

#### Saving the model

In [None]:
import os.path

# Write the model to disk unless a file already exists at the target path.
if not os.path.isfile(''):
    model.save('')