In [2]:
# Load required packages
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import zipfile
import nltk
import os
import string
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize

from transformers import pipeline

In [2]:
classifier = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


In [127]:
output_dir = 'C:\\Users\\yashd\\Desktop\\URAP\\Code\\Topic_text_csv\\'
content_df = pd.read_csv("merged_content.csv")

In [128]:
start_index = 504
num_interviews = len(content_df) - start_index
end_index = start_index + num_interviews
content_df = pd.read_csv("merged_content.csv")[start_index: end_index]
content_df = content_df[['text_id', 'compiled_content']].rename(columns = {'compiled_content' : 'content'})
content_df

Unnamed: 0,text_id,content
170,33-20-62-30,"Hello, how are books? Books are fair. . Okay, ..."
171,33-21-42-99,So how are you going to face the field since y...
172,33-69-32-48,Beatrice how are you? Am okay how are you? You...
173,33-83-17-66,Hello am called Brian. Am called Cotilda. How ...
174,33-91-51-37,How are you Grace? Am fine how are you? Am fin...
...,...,...
499,98-65-25-06,How are you/. Am Shamiya Nassozi. Okay sir. Th...
500,99-33-66-89,"Hello, Teddy ow are you. Am okay how are you? ..."
501,99-41-75-39,"Hello Madam. Hello, how are you? Am fine Madam..."
502,99-50-93-22,Hello winnie. Hello Madam. How are you? Am fin...


In [129]:
topics = ['information','strategy', 'motivation', 'neutral']

In [130]:
# Handles multiple labels associated with each topic
# If avg is true, compute mean similarity across topic terms and return maximum value
# Else, it finds the max value amongst topic terms for each topic, and returns max among those
def topic_classifier(txt, avg = True):
    topic_terms = [['information', 'data'], 
                   ['application', 'cv', 'CV' 'interview', 'strategy', 'job search', 'skills', 'behaviour', 'resume'],
                   ['motivation', 'determination', 'optimism', 'hope']]
    # Obtain dictionaries with the similarity scores with each of the topic related words
    topic_dict = {}
    for i in range(len(topics[:-1])):
        topic_dict[topics[i]] = classifier(txt, topic_terms[i], multi_label = True)['scores']
    
    if avg:
        return max(['information', 'strategy', 'motivation'], key = lambda x: np.mean(topic_dict[x]))
    else:
        return max(['information', 'strategy', 'motivation'], key = lambda x: max(topic_dict[x]))        

In [131]:
def ultimate_tokenize(sentence):
    # Remove punctuation and digits
    sentence = sentence.translate(str.maketrans('', '', string.punctuation + string.digits))
    return word_tokenize(sentence.lower())

In [132]:
def cleaning(interview):
    
    tokens = ultimate_tokenize(interview)
    from nltk.corpus import stopwords
    
    stops = stopwords.words('english')
    stops.extend(['yeah','hello','ye','yes','okay','ok', 'like', 'uhm','hmmmm','hmmm',
                  'eh', 'yah', 'even', 'good', 'evening', 'lovely', 'thanks', 'really', 'name', 'ayugi'])
    stops.extend('.,[,],(,),;,/,-,\',?,",:,<,>,n\'t,|,#,\'s,\",\'re,\'ve,\'ll,\'d,\'re,’'.split(','))
    stops.extend(',')
    
    # 6. Remove stop words. 
    words = [w for w in tokens if not w in stops]
    
    sent = ' '.join(words)
    return sent

In [133]:
# Returns dictionary with topics and similarity scores in relation to TXT
def similarity_to_topics(txt):
    topic_dict = classifier(txt, topics, multi_label = True)
    paired_dict = {topic_dict['labels'][i] : topic_dict['scores'][i] for i in range(len(topics))}
    return paired_dict

In [134]:
%%time
# Create an array of topic similarity dictionaries and obtain the most similar topic from them and store in two arrays
for i in range(start_index, end_index):
    
    print("Processing Interview", i, "out of", end_index, "with text_id: ",content_df['text_id'][i])
    interview_sent = sent_tokenize(re.sub('(shs)', ' dollars', content_df['content'][i]))
    num_sent = len(interview_sent)
    interview_df = pd.DataFrame({'text_id' : np.repeat(content_df['text_id'][i], num_sent)})
    interview_df['content'] = np.array(interview_sent)
    most_similar_topic_array = np.array([])
    
    for j in range(num_sent):
        if (len(cleaning(interview_df['content'][j])) < 10):
            most_similar_topic_array = np.append(most_similar_topic_array, 'neutral')
        else: 
            similar_topic = topic_classifier(interview_df['content'][j], False)
            most_similar_topic_array = np.append(most_similar_topic_array, similar_topic)

    interview_df['topic'] = most_similar_topic_array    
    save_name = content_df['text_id'][i] + ".csv"
    interview_df.to_csv(output_dir + save_name, index = False)

Processing Interview 170 out of 334 with text_id:  33-20-62-30
Processing Interview 171 out of 334 with text_id:  33-21-42-99
Processing Interview 172 out of 334 with text_id:  33-69-32-48
Processing Interview 173 out of 334 with text_id:  33-83-17-66
Processing Interview 174 out of 334 with text_id:  33-91-51-37
Processing Interview 175 out of 334 with text_id:  36-54-28-69
Processing Interview 176 out of 334 with text_id:  36-81-75-91
Processing Interview 177 out of 334 with text_id:  37-21-70-94
Processing Interview 178 out of 334 with text_id:  37-52-72-08
Processing Interview 179 out of 334 with text_id:  37-91-62-46
Processing Interview 180 out of 334 with text_id:  38-01-31-63
Processing Interview 181 out of 334 with text_id:  38-10-90-25
Processing Interview 182 out of 334 with text_id:  38-12-17-88
Processing Interview 183 out of 334 with text_id:  38-40-59-75
Processing Interview 184 out of 334 with text_id:  39-25-69-54
Processing Interview 185 out of 334 with text_id:  40-1

Processing Interview 301 out of 334 with text_id:  61-09-52-60
Processing Interview 302 out of 334 with text_id:  61-28-35-56
Processing Interview 303 out of 334 with text_id:  61-35-75-36
Processing Interview 304 out of 334 with text_id:  61-55-75-84
Processing Interview 305 out of 334 with text_id:  61-59-98-62
Processing Interview 306 out of 334 with text_id:  61-98-97-76
Processing Interview 307 out of 334 with text_id:  62-15-83-01
Processing Interview 308 out of 334 with text_id:  62-16-88-48
Processing Interview 309 out of 334 with text_id:  62-20-22-64
Processing Interview 310 out of 334 with text_id:  62-33-10-15
Processing Interview 311 out of 334 with text_id:  62-43-74-50
Processing Interview 312 out of 334 with text_id:  62-78-18-79
Processing Interview 313 out of 334 with text_id:  62-84-78-25
Processing Interview 314 out of 334 with text_id:  62-90-15-21
Processing Interview 315 out of 334 with text_id:  63-19-99-89
Processing Interview 316 out of 334 with text_id:  63-2

Processing Interview 432 out of 334 with text_id:  83-28-82-19
Processing Interview 433 out of 334 with text_id:  83-33-92-10
Processing Interview 434 out of 334 with text_id:  83-81-94-08
Processing Interview 435 out of 334 with text_id:  83-93-53-14
Processing Interview 436 out of 334 with text_id:  83-99-49-64
Processing Interview 437 out of 334 with text_id:  84-01-59-14
Processing Interview 438 out of 334 with text_id:  84-17-89-53
Processing Interview 439 out of 334 with text_id:  84-91-04-70
Processing Interview 440 out of 334 with text_id:  85-03-35-65
Processing Interview 441 out of 334 with text_id:  85-13-45-53
Processing Interview 442 out of 334 with text_id:  85-62-79-14
Processing Interview 443 out of 334 with text_id:  85-96-36-37
Processing Interview 444 out of 334 with text_id:  86-07-82-51
Processing Interview 445 out of 334 with text_id:  86-53-61-40
Processing Interview 446 out of 334 with text_id:  86-68-29-97
Processing Interview 447 out of 334 with text_id:  86-8