In [1]:
# Load required packages
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from transformers import pipeline

In [2]:
# Zero-shot classifier shared by every scoring function below.
# The model is named explicitly (it is the one the library reported defaulting to)
# so results do not silently change if the library's default model changes.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


In [4]:
# Work from the user's home directory. Done with plain Python instead of the bare
# `cd` automagic, which IPython only honours in single-line cells and which stops
# resolving to the magic once a variable named `cd` exists.
os.chdir(Path.home())
os.getcwd()

C:\Users\yashd


In [294]:
# Loading up test cases
# The path is anchored at the home directory so that loading does not depend on
# whatever the kernel's current working directory happens to be.
DATA_DIR = Path.home() / "Desktop" / "URAP"
strategy_info_test = pd.read_csv(DATA_DIR / "Strategy_Info_Test.csv")
strategy_info_test

Unnamed: 0,content,Topic
0,the unemployment rate is 38%,Information
1,here is advice on how to get a higher salary,Strategy
2,most companies don’t want to use students they...,Strategy
3,if you get an internship offer but the pay at ...,Information
4,builders tend to use vulgar language so try as...,Information
5,employers are not hiring workers at present,Information
6,You have to find deduct your capital and see ...,Information
7,You get connections through you friends becaus...,Strategy
8,Like how to write a CV because when you apply ...,Strategy
9,That’s being impatient and how do you talk to ...,Strategy


In [295]:
# Candidate labels handed to the zero-shot classifier.
# Order matters: code that pairs topics with per-topic term lists walks this list
# from the front, and 'neutral' sits last so it can be dropped with topics[:-1].
topics = [
    'information',
    'strategy',
    'motivation',
    'neutral',
]

In [296]:
def similarity_to_topics(txt):
    """Score TXT against every label in the module-level ``topics`` list.

    Calls the zero-shot ``classifier`` with ``multi_label=True`` and returns a
    dictionary mapping each returned label to its similarity score, keyed in
    the order the classifier returned the labels.
    """
    result = classifier(txt, topics, multi_label=True)
    return dict(zip(result['labels'], result['scores']))

In [297]:
def similarity_to_topic_terms(txt, term_groups=None):
    """Return, for each topic, the mean zero-shot score of TXT over that topic's terms.

    Parameters
    ----------
    txt
        Text to classify.
    term_groups : sequence of lists of labels, optional
        One list of candidate labels per topic, in the same order as the
        module-level ``topics`` list. When omitted, the module-level
        ``topic_terms`` is used, which is what this function read before the
        parameter existed.

    Returns
    -------
    dict
        Maps a topic name to the mean of the classifier scores obtained for
        that topic's terms. Topics and term groups are paired positionally;
        pairing stops at the shorter of the two sequences.
    """
    if term_groups is None:
        # NOTE(review): none of the cells in view defines a module-level
        # `topic_terms` (the only visible one is local to strategy_info_classify),
        # so on a fresh kernel this fallback raises NameError. Pass `term_groups`
        # explicitly or define `topic_terms` before calling.
        term_groups = topic_terms
    return {
        topic: np.mean(classifier(txt, terms, multi_label=True)['scores'])
        for topic, terms in zip(topics, term_groups)
    }

In [298]:
def strategy_info_classify(txt, avg = True):
    """Choose between 'information', 'strategy' and 'motivation' using topic-related terms.

    Each topic is represented by several candidate labels. TXT is scored
    against every label of every topic with the zero-shot ``classifier``
    (``multi_label=True``), and the per-topic scores are then reduced to a
    single number.

    Parameters
    ----------
    txt
        Text to classify.
    avg : bool, default True
        If true, a topic's score is the mean over its labels; otherwise it is
        the highest score among its labels.

    Returns
    -------
    str
        The topic name with the highest reduced score. On ties the topic that
        comes first in ``topics`` wins.
    """
    # 'CV' and 'interview' must be separate labels: without the comma between
    # them Python concatenates the adjacent literals into 'CVinterview'.
    topic_terms = [['information', 'data'],
                   ['application', 'cv', 'CV', 'interview', 'strategy', 'job search', 'skills', 'behaviour', 'resume'],
                   ['motivation', 'determination', 'optimism', 'hope']]

    # zip stops at the shorter sequence, so the trailing 'neutral' topic, which
    # has no term list, is skipped.
    topic_scores = {
        topic: classifier(txt, terms, multi_label = True)['scores']
        for topic, terms in zip(topics, topic_terms)
    }

    reduce_scores = np.mean if avg else max
    return max(topic_scores, key = lambda topic: reduce_scores(topic_scores[topic]))

In [299]:
%%time
# Create an array of topic similarity dictionaries and obtain the most similar topic from them and store in two arrays
topic_dict_array = np.array([])
most_similar_topic_array = np.array([])
for i in range(len(strategy_info_test)):
    topic_dict = similarity_to_topics(strategy_info_test['content'][i])
    topic_dict_array = np.append(topic_dict_array, topic_dict)
    similar_topic = sorted(topic_dict.items(), key=lambda x: x[1], reverse=True)[0][0]
    
    # To differentiate between strategy and information better, additional function with topic-related words
    if (similar_topic == 'strategy' or similar_topic == 'information'): # and abs(topic_dict['strategy'] - topic_dict['information']) > 0.20
        similar_topic = strategy_info_classify(strategy_info_test['content'][i], False)
    most_similar_topic_array = np.append(most_similar_topic_array, similar_topic)

# Add those two arrays as columns
strategy_info_test['predicted_topic'] = most_similar_topic_array    
strategy_info_test['topic_dict'] = topic_dict_array
  
# Extract the topic-specific scores and store them into their respective column in the dataframe
for i in range(len(topics)):
    strategy_info_test[topics[i]] = strategy_info_test['topic_dict'].apply(lambda x: x[topics[i]])
strategy_info_test

Wall time: 2min 1s


Unnamed: 0,content,Topic,predicted_topic,topic_dict,information,strategy,motivation,neutral
0,the unemployment rate is 38%,Information,information,"{'information': 0.8781885504722595, 'strategy'...",0.878189,0.03611,0.0179,0.001567
1,here is advice on how to get a higher salary,Strategy,information,"{'information': 0.9900729060173035, 'strategy'...",0.990073,0.957627,0.337709,0.000831
2,most companies don’t want to use students they...,Strategy,strategy,"{'strategy': 0.8940097093582153, 'information'...",0.788155,0.89401,0.366814,0.024042
3,if you get an internship offer but the pay at ...,Information,information,"{'information': 0.8243071436882019, 'strategy'...",0.824307,0.164881,0.091134,0.005612
4,builders tend to use vulgar language so try as...,Information,strategy,"{'strategy': 0.4126897156238556, 'information'...",0.145049,0.41269,0.114811,0.001173
5,employers are not hiring workers at present,Information,strategy,"{'information': 0.6336061358451843, 'strategy'...",0.633606,0.617034,0.225548,0.072824
6,You have to find deduct your capital and see ...,Information,information,"{'information': 0.9414877891540527, 'strategy'...",0.941488,0.286889,0.106728,0.005053
7,You get connections through you friends becaus...,Strategy,strategy,"{'strategy': 0.9697263240814209, 'information'...",0.91691,0.969726,0.8399,0.01352
8,Like how to write a CV because when you apply ...,Strategy,strategy,"{'information': 0.9749327898025513, 'strategy'...",0.974933,0.033387,0.018218,0.008063
9,That’s being impatient and how do you talk to ...,Strategy,strategy,"{'strategy': 0.04517253860831261, 'information...",0.004795,0.045173,0.002029,0.000371


In [300]:
# Compute the accuracy of the classification process
# Labels in 'Topic' are capitalised while predictions are lower-case, so compare case-insensitively.
total = len(strategy_info_test)
accurate = np.count_nonzero(strategy_info_test['Topic'].str.lower() == strategy_info_test['predicted_topic'])
# Scale to a percentage before rounding; rounding first and then multiplying by 100
# reintroduces floating-point noise (e.g. 64.71000000000001).
print("Accuracy:", accurate, "out of", total, "=", np.round(accurate / total * 100, 2), "%" )

Accuracy: 22 out of 34 = 64.71000000000001 %


In [301]:
# Tally how often each topic was predicted ("actual") and how often it appears
# as the hand-assigned label ("expected"); every topic starts at zero.
actual_count_dict = dict.fromkeys(topics, 0)
expected_count_dict = dict.fromkeys(topics, 0)
for predicted, labelled in zip(strategy_info_test['predicted_topic'], strategy_info_test['Topic']):
    actual_count_dict[predicted] += 1
    expected_count_dict[labelled.lower()] += 1

In [302]:
# Two-row table of labelled vs predicted counts, one column per topic.
summary_df = pd.DataFrame(
    [expected_count_dict, actual_count_dict],
    index=['Expected Count', 'Actual Count'],
)
summary_df

Unnamed: 0,information,strategy,motivation,neutral
Expected Count,12,16,6,0
Actual Count,10,19,5,0
