In [1]:
import pandas as pd
import json
import os
import numpy as np
import torch
import unicodedata
import re
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
import time
import math


device = torch.device("cpu")


In [4]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

group1 = ["Ticketing"]
group2 = ["Sales"]
group3 = ["Digital Marketing","Customer service","Marketing"]
group3_class_index = range(len(group3))
group3_class_dict = dict(zip(group3, group3_class_index))
group4 = [x for x in classes[4:]]
group4_class_index = range(len(group4))
group4_class_dict = dict(zip(group4, group4_class_index))

In [18]:
def load_data(device):
        directory_in_str = "/home/tanmay/Documents/cv/spotmentor/machine-learning-assessment/data/docs"
        dict_data = []
        for file in os.listdir(directory_in_str):
                filename = file
                if filename.endswith(".json"):
                        full_path = os.path.join(directory_in_str, filename)
                        with open(full_path) as f:
                                data = json.load(f)
                                temp_data = {}
                                temp_data["description"]=data["jd_information"]["description"]
                                temp_data["id"] = int(data["_id"])
                                dict_data.append(temp_data)

        df = pd.DataFrame.from_dict(dict_data, orient='columns')
        df_dep = pd.read_csv('../data/document_departments.csv')
        df_dep.columns=["id","department"]
        classes_ = df_dep["department"].unique()
        df_dep["department_new"] = df_dep["department"].apply(check_exists,classes=classes_)
        full_table= df.merge(df_dep,on='id',how='left')
        classes_ = df_dep["department_new"].unique()
        te = df_dep["department_new"].value_counts()
        print(te)
        dc = te.to_dict()
        no = full_table.shape[0]
        weight_list = []
        #for it in classes_:
        #    weight_list.append(1/dc[it])
        weight_tensor = torch.tensor(weight_list,dtype = torch.float ,device=device)
        return full_table, classes_, weight_tensor

In [14]:
def load_word_emb(file_name):
        script_dir = os.getcwd()
        abs_file_path = os.path.join(script_dir, file_name)
        print(('Loading word embedding from %s'%file_name))
        ret = {}
        with open(abs_file_path) as inf:
                for idx, line in enumerate(inf):
                        if (idx >= 10000):
                                break
                        info = line.strip().split(' ')
                        if info[0].lower() not in ret:
                                ret[info[0]] = np.array([float(x) for x in info[1:]])
        return ret

In [5]:
def embeddedTensorFromSentence(sentence,device,word_emb,N_word):
        desc_tokens = sentence.split(" ")
        emb_tokens = []
        #print(len(desc_tokens))
        for token in desc_tokens:
                val = word_emb.get(token, np.zeros(N_word, dtype=np.float32))
                emb_tokens.append(torch.tensor(val,dtype = torch.float ,device=device).view(1,1,N_word))
        #return emb_tokens
        return emb_tokens

def check_exists(dep,classes):
        if( dep in group1):
                return "Ticketing"
        elif(dep in group2):
                return "Sales"
        elif(dep in group3):
                return "group3"
        else:
                return "group4"

In [39]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, 1)
        self.gru = nn.GRU(input_size, hidden_size)

    def forward(self, input, hidden):
        embedded = input
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        #self.embedding = nn.Embedding(output_size, hidden_size)
        #self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, hidden):
        #import pdb;pdb.set_trace();
        #output = self.embedding(input).view(1, 1, -1)
        output = F.relu(hidden)
        #output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


In [8]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [9]:
N_word=100
B_word=6
hidden_size = 256
max_length = 1000
SOS_token = 0
CLASS_size = 6

In [19]:
word_emb = load_word_emb('../glove/glove.%dB.%dd.txt'%(B_word,N_word))
full_table, classes_, weight_tensor = load_data(device)
train_df, test_df = train_test_split(full_table, test_size=0.2, random_state=42)
print(train_df.department_new.value_counts())
print(test_df.department_new.value_counts())
CLASS_size = len(classes_)
class_index = range(CLASS_size)
class_dict = dict(zip(classes_, class_index))

Loading word embedding from ../glove/glove.6B.100d.txt
Ticketing    270
Name: department_new, dtype: int64
Ticketing    212
Name: department_new, dtype: int64
Ticketing    58
Name: department_new, dtype: int64


In [45]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=max_length):
        encoder_hidden = encoder.initHidden()
        input_length = len(input_tensor)
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]
        decoder_hidden = encoder_hidden
        decoder_output= decoder(decoder_hidden)
        #decoder_output, decoder_hidden, decoder_attention = decoder(decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        loss = criterion(decoder_output, torch.max(target_tensor, 1)[1])
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        return loss.item()


In [28]:
def trainIters(encoder, decoder,data_df, n_iters,class_dict, print_every=1000, plot_every=100, learning_rate=0.05):
        start = time.time()
        plot_losses = []
        print_loss_total = 0  # Reset every print_every
        plot_loss_total = 0  # Reset every plot_every
        encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
        decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
        criterion = nn.NLLLoss()#weight = weight_tensor)
        for iter in range(1, n_iters + 1):
                #print(iter)
                sentence = train_df.iloc[iter - 1]["description"]
                sentence = normalizeString(sentence)
                input_tensor = embeddedTensorFromSentence(sentence,device,word_emb,N_word)
                target_class = data_df.iloc[iter - 1]["department_new"]
                class_index = []
                for i in range(CLASS_size):
                        class_index.append(0)
                class_index[class_dict[target_class]] = 1
                #import pdb; pdb.set_trace();
                #print(class_index)
                target_tensor = torch.tensor(class_index,dtype = torch.long ,device=device).view(1,CLASS_size)
                loss = train(input_tensor, target_tensor, encoder,decoder, encoder_optimizer, decoder_optimizer, criterion)
                print_loss_total += loss
                plot_loss_total += loss
                if iter % print_every == 0:
                        print_loss_avg = print_loss_total / print_every
                        print_loss_total = 0
                        print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                                 iter, iter / n_iters * 100, print_loss_avg))

                if iter % plot_every == 0:
                        plot_loss_avg = plot_loss_total / plot_every
                        plot_losses.append(plot_loss_avg)
                        plot_loss_total = 0


In [29]:
def evaluate(encoder, decoder, input_tensor, max_length, device):
        with torch.no_grad():
                input_length = len(input_tensor)
                encoder_hidden = encoder.initHidden()

                encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
                for ei in range(input_length):
                        encoder_output, encoder_hidden = encoder(input_tensor[ei],encoder_hidden)
                        encoder_outputs[ei] += encoder_output[0, 0]
                decoder_hidden = encoder_hidden
                decoder_output, decoder_hidden = decoder(decoder_hidden)
                topv, topi = decoder_output.topk(1)

        return decoder_output

In [30]:
def evaluateTest(encoder,decoder,test_df,class_dict):
        test_size = test_df.shape[0]
        y_true = []
        y_pred = []
        for iter in range(0, test_size + 1):
                sentence = test_df.iloc[iter - 1]["description"]
                sentence = normalizeString(sentence)
                input_tensor = embeddedTensorFromSentence(sentence,device,word_emb,N_word)
                target_class = test_df.iloc[iter - 1]["department_new"]
                class_index = []
                target_index = class_dict[target_class]
                #print(target_index)
                y_true.append(target_index)
                output = evaluate(encoder, decoder, input_tensor,max_length,device)
                topv, topi = output.topk(1)
                y_pred.append(topi.numpy()[0][0])
        #import pdb; pdb.set_trace();
        test_df["pred_department"] = pd.Series(y_pred)
        cnf_matrix = confusion_matrix(y_true, y_pred)
        print("Accuarcy")
        print(accuracy_score(y_true, y_pred))
        print(cnf_matrix)
        return(test_df)

In [47]:
encoder = EncoderRNN(N_word, hidden_size).to(device)
encoder.apply(init_weights)
decoder = DecoderRNN(hidden_size, CLASS_size).to(device)
decoder.apply(init_weights)
n_iterations = train_df.shape[0]
trainIters(encoder, decoder, train_df, n_iterations,class_dict, print_every=50, plot_every=10)
print(classes_)
pred_df = evaluateTest(encoder,decoder,test_df)
ticket_df = pred_df[pred_df["pred_department"]=="Ticketing"]
sales_df = pred_df[pred_df["pred_department"]=="Sales"]

  This is separate from the ipykernel package so we can avoid doing imports until


0m 18s (- 5m 32s) (50 5%) 1.1217
0m 30s (- 4m 10s) (100 10%) 10.7655
0m 40s (- 3m 29s) (150 16%) 5.1842
0m 53s (- 3m 13s) (200 21%) 5.6205
1m 6s (- 3m 0s) (250 26%) 15.7907
1m 20s (- 2m 49s) (300 32%) 40.9089
1m 46s (- 2m 55s) (350 37%) 71.0659
2m 8s (- 2m 50s) (400 43%) 32.6992
2m 33s (- 2m 43s) (450 48%) 32.6481
2m 58s (- 2m 32s) (500 53%) 99.9666
3m 17s (- 2m 15s) (550 59%) 57.2222
3m 26s (- 1m 53s) (600 64%) 23.3089
3m 41s (- 1m 34s) (650 69%) 46.8569
3m 54s (- 1m 16s) (700 75%) 117.3137
4m 6s (- 0m 58s) (750 80%) 73.3028
4m 19s (- 0m 41s) (800 86%) 42.3743
4m 30s (- 0m 25s) (850 91%) 144.6995
4m 43s (- 0m 9s) (900 96%) 172.3737


NameError: name 'showPlot' is not defined

In [None]:
encoder_group3 = EncoderRNN(N_word, hidden_size).to(device)
encoder_group3.apply(init_weights)
decoder_group3 = DecoderRNN(hidden_size, CLASS_size).to(device)
decoder_group3.apply(init_weights)
group3_train_df = train_df[train_df["department_new"]=="group3"]
group3_test_df = pred_df[test_df["pred_department"]=="group3"]
n_iterations = train_df.shape[0]
trainIters(encoder, decoder, group3_train_df, "department", n_iterations, print_every=50, plot_every=10)
group3_pred_df = evaluateTest(encoder,decoder, group3_test_df,"department")

In [None]:
encoder_group4 = EncoderRNN(N_word, hidden_size).to(device)
encoder_group4.apply(init_weights)
decoder_group4 = DecoderRNN(hidden_size, CLASS_size).to(device)
decoder_group4.apply(init_weights)
group4_train_df = train_df[train_df["department_new"]=="group4"]
group4_test_df = pred_df[test_df["department_new"]=="group4"]
n_iterations = train_df.shape[0]
trainIters(encoder, decoder, group4_train_df, "department",group4_class_dict, n_iterations, print_every=50, plot_every=10)
group4_pred_df = evaluateTest(encoder,decoder, group4_test_df,"department")

In [None]:
final_test = ticket_df.append(sales_df)
final_test = final_test.append(group3_test_df)
final_test = final_test.append(group4_test_df)
y_true = final_test["department"].tolist()
y_pred = final_test["pred_department"].tolist()
print("Accuarcy")
print(accuracy_score(y_true, y_pred))