## Using GloVe Wiki Gigaword 100d

In [4]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from textblob import TextBlob
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import string
import matplotlib.pyplot as plt

## Load input transcriptions

In [None]:

# Load the call transcriptions; the classification cell below reads the 'text' column.
df = pd.read_csv("train.csv")

In [None]:
df.head()

## Load gigaword-100 pre-trained embeddings

In [None]:
import gensim.downloader as api

# Load GloVe
# Obtains the pre-trained "glove-wiki-gigaword-100" vectors through gensim's downloader API.
word_vectors = api.load("glove-wiki-gigaword-100")

In [None]:
# Persist the loaded vectors to a local file so they can be reloaded with KeyedVectors.load.
word_vectors.save('fstwk.d2v')

In [None]:
from gensim.models import KeyedVectors
# Reload the vectors from the local file written by word_vectors.save('fstwk.d2v').
# NOTE(review): the cells of this section look words up through `word_vectors`;
# `model` is only used for a single lookup.
model = KeyedVectors.load("fstwk.d2v")

In [None]:
word_vectors.get_vector("beautiful")

In [None]:
model.get_vector("apple")

## Load pre-defined business topics and associated terms

In [None]:
# Load the predefined topics; later cells read the 'Topic' and 'Related words' columns.
topiclist = pd.read_csv("TopicList.csv")

In [None]:
topiclist.head()

In [None]:
# Cast every column to object dtype — presumably so that a single cell can
# later hold a Python list (the 'TargetVector' written by the next cell).
topiclist = topiclist.astype(object)

## Extract word embeddings for each of the words in Related Words column

In [None]:
# Build one summed embedding ('TargetVector') per topic from its 'Related words'.

df_dict = {}  # not used by this cell; kept in case another cell relies on the name

# Only the first MAX_TOPICS rows are vectorised (the loop used to stop after
# row index 81). Raise this limit to cover more topics.
MAX_TOPICS = 82
_strip_punctuation = str.maketrans('', '', string.punctuation)
stop_word_set = set(stop_words)  # O(1) membership tests instead of scanning a list

# A cell can only hold a list when the column already exists with object
# dtype, so create it up front and cast once instead of on every iteration.
if 'TargetVector' not in topiclist.columns:
    topiclist['TargetVector'] = np.nan
topiclist = topiclist.astype(object)

for j in range(min(len(topiclist.index), MAX_TOPICS)):
    related_words = topiclist.at[j, 'Related words']
    if not isinstance(related_words, str):
        # Empty CSV cells are not strings; leave the row without a vector.
        continue
    # Commas become spaces so "loan,mortgage" still yields two words; all
    # remaining punctuation is dropped before tokenising on whitespace.
    line = related_words.replace(',', ' ').translate(_strip_punctuation)
    keywords = {word for word in line.lower().split() if word not in stop_word_set}
    vw = []
    for word in keywords:
        try:
            vw.append(word_vectors.get_vector(word))
        except KeyError:
            # Word is not in the embedding vocabulary: skip it on purpose.
            continue
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    vw = np.array(vw, dtype=float)
    # With no known keyword the sum is the scalar 0.0 rather than a list; the
    # classification cell ignores such rows because it only accepts lists.
    topiclist.at[j, 'TargetVector'] = np.sum(vw, axis=0).tolist()

In [None]:
topiclist.head()

In [None]:
len(topiclist)

## Extract Word Vector for a transcript call

In [None]:

def build_wordvector(keywords):
    """Sum the embedding vectors of the distinct words in *keywords*.

    Args:
        keywords: Iterable of word strings. Duplicates are collapsed, so each
            distinct word contributes at most once.

    Returns:
        A list of floats (one per embedding dimension), or the scalar ``0.0``
        when none of the words has a vector (NumPy's sum over an empty array).
    """
    vectors = []
    for word in set(keywords):
        try:
            vectors.append(word_vectors.get_vector(word))
        except KeyError:
            # Out-of-vocabulary word: skipped on purpose. Anything else
            # (e.g. a missing model) should surface instead of being hidden.
            continue
    # `np.float` was removed in NumPy 1.24; the builtin `float` is equivalent.
    return np.sum(np.array(vectors, dtype=float), axis=0).tolist()
#     target_vector = np.sum(np.array([word_vectors.get_vector(w) for w in keywords]), axis=0).tolist()
# target_vector

In [None]:
topiclist.head()

In [None]:
df.head()

## Define the cosine similarity function 

In [None]:
def similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a Python float.

    When either vector has a norm of exactly zero the result is defined as
    0.0 so that no division by zero takes place.
    """
    magnitude_a, magnitude_b = norm(a), norm(b)
    if 0.0 in (magnitude_a, magnitude_b):
        return 0.0
    return float(dot(a, b) / (magnitude_a * magnitude_b))



## Calculate the similarity score between the summed vectors of the predefined business terms and the call transcriptions

In [None]:
# Label every call with the topic whose summed keyword vector has the highest
# cosine similarity to the call's own summed word vector.

# Topic row used when no topic scores above zero (e.g. none of the call's
# words has an embedding).
# NOTE(review): such calls are therefore counted under the first topic —
# confirm this is intended rather than leaving them unclassified.
FALLBACK_TOPIC_INDEX = 0
_strip_punctuation = str.maketrans('', '', string.punctuation)

# Rows without a usable vector (not a list) never change, so filter them once
# instead of re-checking every topic for every call.
candidate_topics = [
    (topic_index, topic_vector)
    for topic_index, topic_vector in topiclist['TargetVector'].items()
    if isinstance(topic_vector, list)
]

classifications = []
for text in df['text']:
    if isinstance(text, str):
        # Same normalisation as the topic keywords (lower-case, punctuation
        # removed); otherwise capitalised or punctuated tokens would miss
        # the embedding lookup and be silently dropped.
        tokens = text.lower().translate(_strip_punctuation).split()
    else:
        # Missing transcript (not a string): nothing to embed.
        tokens = []
    target_vector = build_wordvector(tokens)

    top_score = 0
    top_index = FALLBACK_TOPIC_INDEX
    for topic_index, topic_vector in candidate_topics:
        score = similarity(topic_vector, target_vector)
        if score > top_score:
            top_score, top_index = score, topic_index
    classifications.append(topiclist.at[top_index, 'Topic'])

df['TopicClassification'] = classifications


In [None]:
# Report how many transcripts were processed.
print(f"Total number of calls: {len(df.index)}")

## Topic Distribution bar chart

In [None]:
# Bar chart: number of calls assigned to each topic.
topic_counts = df['TopicClassification'].value_counts()
ax = topic_counts.plot(
    kind='bar',
    figsize=(14, 8),
    title="Distribution of Topics",
    color='red',
)
ax.set(xlabel="Topics", ylabel="Number of Calls")
plt.show()

## Topic Distribution pie chart

In [None]:
# Pie chart: share of calls per topic, each slice labelled with its percentage.
topic_counts = df['TopicClassification'].value_counts()
ax = topic_counts.plot(
    kind='pie',
    figsize=(14, 8),
    title="Distribution of Topics",
    autopct='%1.1f%%',
)
ax.set(xlabel="Topics", ylabel="Number of Calls")
plt.show()

In [None]:
# Keep the first 50 rows of the frame as a sample for export.
gigaword_glove_100_df = df.head(50)

In [None]:
df.head()

In [None]:
# Write the 50-row sample to CSV without the index column.
gigaword_glove_100_df.to_csv('gigaword_glove_100_df.csv', index=False)

## Using Gensim Glove 6B 300D

In [None]:
from scipy import spatial
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import matplotlib.pyplot as plt
from numpy.linalg import norm
from numpy import dot

In [None]:

# Reload the call transcriptions so this section starts from an unclassified frame.
df = pd.read_csv("train.csv")

In [None]:
# Reload the predefined topics ('Topic' and 'Related words' are read below).
topiclist = pd.read_csv("TopicList.csv")

In [None]:
# Name of the raw GloVe download.
# NOTE(review): `filename` is reassigned in the next cell before it is used.
filename = 'glove.6B.300d.txt'

In [None]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
# download glove 300d from https://nlp.stanford.edu/projects/glove/
# NOTE(review): the file loaded here carries a '.word2vec' suffix and is read
# with load_word2vec_format; no visible cell creates it from the raw
# 'glove.6B.300d.txt' — presumably it was converted beforehand (e.g. with
# gensim's glove2word2vec script). Confirm before running.
filename = 'glove.6B.300d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
# calculate: (king - man) + woman = ?
# Quick sanity check of the loaded vectors; prints the single best match.
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

In [None]:
def build_wordvector(keywords):
    """Sum the embedding vectors of the distinct words in *keywords*.

    Args:
        keywords: Iterable of word strings. Duplicates are collapsed, so each
            distinct word contributes at most once.

    Returns:
        A list of floats (one per embedding dimension), or the scalar ``0.0``
        when none of the words has a vector (NumPy's sum over an empty array).
    """
    vectors = []
    for word in set(keywords):
        try:
            # `model` is already a KeyedVectors object: look the word up on it
            # directly. The old `model.wv.word_vec(...)` spelling fails on
            # gensim 4, and the former bare `except:` hid that failure, so
            # every word was silently skipped.
            vectors.append(model.get_vector(word))
        except KeyError:
            # Out-of-vocabulary word: skipped on purpose.
            continue
    # `np.float` was removed in NumPy 1.24; the builtin `float` is equivalent.
    return np.sum(np.array(vectors, dtype=float), axis=0).tolist()

In [None]:
def similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a Python float.

    When either vector has a norm of exactly zero the result is defined as
    0.0 so that no division by zero takes place.
    """
    magnitude_a, magnitude_b = norm(a), norm(b)
    if 0.0 in (magnitude_a, magnitude_b):
        return 0.0
    return float(dot(a, b) / (magnitude_a * magnitude_b))

In [None]:
# Cast every column to object dtype — presumably so that a single cell can
# later hold a Python list (the 'TargetVector' written further down).
topiclist = topiclist.astype(object)

In [None]:
# Build one summed embedding ('TargetVector') per topic from its 'Related words'.

df_dict = {}  # not used by this cell; kept in case another cell relies on the name

# Only the first MAX_TOPICS rows are vectorised (the loop used to stop after
# row index 81). Raise this limit to cover more topics.
MAX_TOPICS = 82
_strip_punctuation = str.maketrans('', '', string.punctuation)
stop_word_set = set(stop_words)  # O(1) membership tests instead of scanning a list

# A cell can only hold a list when the column already exists with object
# dtype, so create it up front and cast once instead of on every iteration.
if 'TargetVector' not in topiclist.columns:
    topiclist['TargetVector'] = np.nan
topiclist = topiclist.astype(object)

for j in range(min(len(topiclist.index), MAX_TOPICS)):
    related_words = topiclist.at[j, 'Related words']
    if not isinstance(related_words, str):
        # Empty CSV cells are not strings; leave the row without a vector.
        continue
    # Commas become spaces so "loan,mortgage" still yields two words; all
    # remaining punctuation is dropped before tokenising on whitespace.
    line = related_words.replace(',', ' ').translate(_strip_punctuation)
    keywords = {word for word in line.lower().split() if word not in stop_word_set}
    vw = []
    for word in keywords:
        try:
            # `model` is already a KeyedVectors object; `model.wv.word_vec`
            # fails on gensim 4 and the former bare `except:` hid that.
            vw.append(model.get_vector(word))
        except KeyError:
            # Word is not in the embedding vocabulary: skip it on purpose.
            continue
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    vw = np.array(vw, dtype=float)
    # With no known keyword the sum is the scalar 0.0 rather than a list; the
    # classification cell ignores such rows because it only accepts lists.
    topiclist.at[j, 'TargetVector'] = np.sum(vw, axis=0).tolist()

In [None]:
topiclist.head()

In [None]:
# Label every call with the topic whose summed keyword vector has the highest
# cosine similarity to the call's own summed word vector.

# Topic row used when no topic scores above zero (e.g. none of the call's
# words has an embedding).
# NOTE(review): such calls are therefore counted under the first topic —
# confirm this is intended rather than leaving them unclassified.
FALLBACK_TOPIC_INDEX = 0
_strip_punctuation = str.maketrans('', '', string.punctuation)

# Rows without a usable vector (not a list) never change, so filter them once
# instead of re-checking every topic for every call.
candidate_topics = [
    (topic_index, topic_vector)
    for topic_index, topic_vector in topiclist['TargetVector'].items()
    if isinstance(topic_vector, list)
]

classifications = []
for text in df['text']:
    if isinstance(text, str):
        # Same normalisation as the topic keywords (lower-case, punctuation
        # removed); otherwise capitalised or punctuated tokens would miss
        # the embedding lookup and be silently dropped.
        tokens = text.lower().translate(_strip_punctuation).split()
    else:
        # Missing transcript (not a string): nothing to embed.
        tokens = []
    target_vector = build_wordvector(tokens)

    top_score = 0
    top_index = FALLBACK_TOPIC_INDEX
    for topic_index, topic_vector in candidate_topics:
        score = similarity(topic_vector, target_vector)
        if score > top_score:
            top_score, top_index = score, topic_index
    classifications.append(topiclist.at[top_index, 'Topic'])

df['TopicClassification'] = classifications


In [None]:
df.head()

In [None]:
# Bar chart: number of calls assigned to each topic.
topic_counts = df['TopicClassification'].value_counts()
ax = topic_counts.plot(
    kind='bar',
    figsize=(14, 8),
    title="Distribution of Topics",
    color='red',
)
ax.set(xlabel="Topics", ylabel="Number of Calls")
plt.show()

In [None]:
# Pie chart: share of calls per topic, each slice labelled with its percentage.
topic_counts = df['TopicClassification'].value_counts()
ax = topic_counts.plot(
    kind='pie',
    figsize=(14, 8),
    title="Distribution of Topics",
    autopct='%1.1f%%',
)
ax.set(xlabel="Topics", ylabel="Number of Calls")
plt.show()

In [None]:
# Keep the first 50 rows of the frame as a sample for export.
wiki_glove_300_df = df.head(50)

In [None]:
# Write the 50-row sample to CSV without the index column.
wiki_glove_300_df.to_csv('wiki_glove_300_df.csv', index=False)

## FastText 

In [None]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from textblob import TextBlob
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import string
import matplotlib.pyplot as plt
import fasttext

In [None]:

# Reload the call transcriptions so this section starts from an unclassified frame.
df = pd.read_csv("train.csv")

In [None]:
# Reload the predefined topics ('Topic' and 'Related words' are read below).
topiclist = pd.read_csv("TopicList.csv")

In [None]:
# Cast every column to object dtype — presumably so that a single cell can
# later hold a Python list (the 'TargetVector' written further down).
topiclist = topiclist.astype(object)

In [None]:
#### download the english fasttext word vectors from https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
#### for other languages visit: https://fasttext.cc/docs/en/pretrained-vectors.html

In [None]:
# Path of the pre-trained fastText binary model passed to fasttext.load_model.
pretrained_model = 'wiki.en.bin'

In [None]:
# Load the fastText model; the cells below query it with get_word_vector.
fasttext_model = fasttext.load_model(pretrained_model)

In [None]:
fasttext_model.get_word_vector("pay")

In [None]:
fasttext_model.get_nearest_neighbors("bank")

In [None]:
def build_wordvector(keywords):
    """Sum the fastText vectors of the distinct words in *keywords*.

    Args:
        keywords: Iterable of word strings. Duplicates are collapsed, so each
            distinct word contributes at most once.

    Returns:
        A list of floats (one per embedding dimension), or the scalar ``0.0``
        when no vector could be obtained (NumPy's sum over an empty array).
    """
    vectors = []
    for word in set(keywords):
        try:
            vectors.append(fasttext_model.get_word_vector(word))
        except Exception:
            # Best-effort skip of a word the model rejects. Narrowed from a
            # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
            # NOTE(review): fastText presumably returns a vector for unknown
            # words as well, so this branch should be rare — confirm.
            continue
    # `np.float` was removed in NumPy 1.24; the builtin `float` is equivalent.
    return np.sum(np.array(vectors, dtype=float), axis=0).tolist()

In [None]:
def similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a Python float.

    When either vector has a norm of exactly zero the result is defined as
    0.0 so that no division by zero takes place.
    """
    magnitude_a, magnitude_b = norm(a), norm(b)
    if 0.0 in (magnitude_a, magnitude_b):
        return 0.0
    return float(dot(a, b) / (magnitude_a * magnitude_b))

In [None]:
# Cast every column to object dtype — presumably so that a single cell can
# later hold a Python list (the 'TargetVector' written by the next cell).
topiclist = topiclist.astype(object)

In [None]:
# Build one summed embedding ('TargetVector') per topic from its 'Related words'.

df_dict = {}  # not used by this cell; kept in case another cell relies on the name

# Only the first MAX_TOPICS rows are vectorised (the loop used to stop after
# row index 81). Raise this limit to cover more topics.
MAX_TOPICS = 82
_strip_punctuation = str.maketrans('', '', string.punctuation)
stop_word_set = set(stop_words)  # O(1) membership tests instead of scanning a list

# A cell can only hold a list when the column already exists with object
# dtype, so create it up front and cast once instead of on every iteration.
if 'TargetVector' not in topiclist.columns:
    topiclist['TargetVector'] = np.nan
topiclist = topiclist.astype(object)

for j in range(min(len(topiclist.index), MAX_TOPICS)):
    related_words = topiclist.at[j, 'Related words']
    if not isinstance(related_words, str):
        # Empty CSV cells are not strings; leave the row without a vector.
        continue
    # Commas become spaces so "loan,mortgage" still yields two words; all
    # remaining punctuation is dropped before tokenising on whitespace.
    line = related_words.replace(',', ' ').translate(_strip_punctuation)
    keywords = {word for word in line.lower().split() if word not in stop_word_set}
    vw = []
    for word in keywords:
        try:
            vw.append(fasttext_model.get_word_vector(word))
        except Exception:
            # Best-effort skip of a word the model rejects. Narrowed from a
            # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
            continue
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    vw = np.array(vw, dtype=float)
    # With no usable keyword the sum is the scalar 0.0 rather than a list; the
    # classification cell ignores such rows because it only accepts lists.
    topiclist.at[j, 'TargetVector'] = np.sum(vw, axis=0).tolist()

In [None]:
# Label every call with the topic whose summed keyword vector has the highest
# cosine similarity to the call's own summed word vector.

# Topic row used when no topic scores above zero.
# NOTE(review): such calls are therefore counted under the first topic —
# confirm this is intended rather than leaving them unclassified.
FALLBACK_TOPIC_INDEX = 0
_strip_punctuation = str.maketrans('', '', string.punctuation)

# Rows without a usable vector (not a list) never change, so filter them once
# instead of re-checking every topic for every call.
candidate_topics = [
    (topic_index, topic_vector)
    for topic_index, topic_vector in topiclist['TargetVector'].items()
    if isinstance(topic_vector, list)
]

classifications = []
for text in df['text']:
    if isinstance(text, str):
        # Same normalisation as the topic keywords (lower-case, punctuation
        # removed) so both sides of the comparison are tokenised alike.
        tokens = text.lower().translate(_strip_punctuation).split()
    else:
        # Missing transcript (not a string): nothing to embed.
        tokens = []
    target_vector = build_wordvector(tokens)

    top_score = 0
    top_index = FALLBACK_TOPIC_INDEX
    for topic_index, topic_vector in candidate_topics:
        score = similarity(topic_vector, target_vector)
        if score > top_score:
            top_score, top_index = score, topic_index
    classifications.append(topiclist.at[top_index, 'Topic'])

df['TopicClassification'] = classifications


In [None]:
topiclist.head()

In [None]:
df.head(20)

In [None]:
# Bar chart: number of calls assigned to each topic.
topic_counts = df['TopicClassification'].value_counts()
ax = topic_counts.plot(
    kind='bar',
    figsize=(14, 8),
    title="Distribution of Topics",
    color='red',
)
ax.set(xlabel="Topics", ylabel="Number of Calls")
plt.show()

In [None]:
# Pie chart: share of calls per topic, each slice labelled with its percentage.
topic_counts = df['TopicClassification'].value_counts()
ax = topic_counts.plot(
    kind='pie',
    figsize=(14, 8),
    title="Distribution of Topics",
    autopct='%1.1f%%',
)
ax.set(xlabel="Topics", ylabel="Number of Calls")
plt.show()

In [None]:
# Keep the first 50 rows of the frame as a sample for export.
fasttext_df = df.head(50)

In [None]:
# Write the 50-row sample to CSV without the index column.
fasttext_df.to_csv('fasttext_df.csv', index=False)