# Connect this Google Colab notebook to Google Drive

In [None]:
# Mount Google Drive at /content/drive so the CSV files read and written
# by this notebook are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Modules

In [None]:
%%capture
# Install the Hugging Face `transformers` package; the %%capture cell magic
# hides pip's output.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install transformers

In [None]:
import pandas as pd
import numpy as np

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Initialize Sentiment Analysis Model

In [None]:
import torch

# Select the compute device: the current CUDA device index (an int) when a GPU
# is available, otherwise the string "cpu".
# NOTE(review): `device` is not referenced by any later cell shown in this
# notebook (neither the model nor the inputs are moved to it) — confirm intent.
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"

Load the sentiment analysis model. More information is available on its model card: [cardiffnlp/twitter-roberta-base-sentiment-latest](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest)

In [None]:
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification
    )

# Hugging Face Hub id of the pretrained sentiment checkpoint
model_id = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Load the sequence-classification model from Hugging Face.
# num_labels=3 matches the three classes this notebook reports
# (negative / neutral / positive).
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=3
)

# Load the tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

# Sentiment Analysis function

In [None]:
# Sample feedback text used to try out the sentiment function `sa`.
test = '''
Fair salary movement for old employees. New hires with little knowledge about the product or company get higher pay.
'''

In [None]:
def sa(doc, return_all=False):
  """Score the sentiment of one document with the notebook-level model.

  Uses the global `tokenizer` and `model` loaded earlier in the notebook.

  Args:
    doc: Text to classify. Inputs longer than 512 tokens are truncated.
    return_all: If True, return the probability of every label instead of
      only the most likely one.

  Returns:
    When `return_all` is False, a `(label, score)` tuple holding the name of
    the most probable label and its softmax probability.
    When `return_all` is True, a dict mapping each label name to its softmax
    probability, ordered by class id.
  """
  batch = tokenizer(doc,
                    padding=True,
                    truncation=True,
                    max_length=512, # Limit to 512 tokens max
                    return_tensors="pt")
  # Keep the inputs on the same device as the model weights; this is a no-op
  # while the model stays on the CPU.
  batch = batch.to(model.device)

  with torch.no_grad():
    logits = model(**batch).logits

  # Only the first row is used: the function scores one document per call.
  probs = torch.softmax(logits, dim=1)[0]
  id2label = model.config.id2label

  if return_all:
    # Pair each probability with its label through the class id instead of
    # relying on the iteration order of `id2label.values()`.
    return {id2label[idx]: prob for idx, prob in enumerate(probs.tolist())}

  score, best_idx = torch.max(probs, dim=0)
  return id2label[best_idx.item()], score.item()

In [None]:
# Try the function on the sample text and show the probability of every label
sa(test, return_all=True)

{'negative': 0.1141439825296402,
 'neutral': 0.6975476741790771,
 'positive': 0.18830831348896027}

# Load and Prepare Dataset

## Get sentiments

In [None]:
import pandas as pd

In [None]:
# Survey question identifier; it is interpolated into the CSV file names
# that this notebook reads and writes.
q = 'Q1'

In [None]:
# Display every column when a DataFrame is rendered
pd.set_option('display.max_columns', None)

# Topic-model output for the selected question
df = pd.read_csv(
    f'/content/drive/Shareddrives/HR x R&D Collaboration/Topic Modeling/GPulse Free-Text Responses/data/topics/bertopic_llm_{q}.csv'
)

In [None]:
# Percentage of responses assigned to the outlier topic (-1).
# Derived from the frame instead of the hard-coded 73 / 562 so the figure
# stays correct when a different input file is loaded.
round(float((df['topic'] == -1).mean()) * 100, 1)

13.0

In [None]:
# Number of responses per topic id (-1 is the outlier topic)
df['topic'].value_counts()

-1     73
 0     72
 1     63
 2     50
 3     44
 4     29
 5     29
 7     27
 6     27
 8     24
 9     23
 10    18
 12    14
 11    14
 13    12
 14    12
 15    11
 16    11
 17     9
Name: topic, dtype: int64

In [None]:
# Response counts per topic, excluding the -1 outlier topic.
# The recorded outputs of the cells that consume `dd` (18 percentages, and
# sum(dd) + 73 == 562) correspond to the `topic` column, not `main_topic`.
# Dropping -1 by label also avoids assuming the outlier group is the largest.
dd = list(df['topic'].value_counts().drop(-1, errors='ignore').values)

In [None]:
# Response counts per main topic; value_counts() skips NaN by default, so rows
# without a main topic are not included.
ff = list(df['main_topic'].value_counts().values)

In [None]:
# Share of all responses per main topic, in percent.
# The denominator is taken from the frame rather than the hard-coded 562.
for count in ff:
    print(round((count / len(df)) * 100, 0))

40.0
20.0
14.0
14.0


In [None]:
# Sanity check: the counts in `dd` plus the number of outlier (-1) responses.
# The outlier count is computed from the frame instead of the hard-coded 73.
sum(dd) + int((df['topic'] == -1).sum())

562

In [None]:
# Share of all responses per entry of `dd`, in percent.
# The denominator is taken from the frame rather than the hard-coded 562.
for count in dd:
    print(round((count / len(df)) * 100, 0))

13.0
11.0
9.0
8.0
5.0
5.0
5.0
5.0
4.0
4.0
3.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0


In [None]:
# Distinct inferred summaries attached to topic 4
df.loc[df['topic'] == 4, 'inferred_summary'].unique()

array(['Users have given feedback that they want better customer service. They want to be able to get assistance face-to-face when they encounter problems, as well as self-service options. Users have also said that the response time to their inquiries and requests needs to be quicker. Additionally, users have reported instances of scam and fraud, and have requested that the company take action to resolve these issues. Users want to be able to contact customer service 24/7, and want the company to work on improving its customer service overall.'],
      dtype=object)

In [None]:
# Distinct sub-topic labels attached to topic 2
df.loc[df['topic'] == 2, 'Sub-topic'].unique()

array(['2_company_teams_leadership_work_processes_levels_onboarding_organization'],
      dtype=object)

In [None]:
# Number of responses per topic id
df['topic'].value_counts()

-1     73
 0     72
 1     63
 2     50
 3     44
 4     29
 5     29
 7     27
 6     27
 8     24
 9     23
 10    18
 12    14
 11    14
 13    12
 14    12
 15    11
 16    11
 17     9
Name: topic, dtype: int64

Get the sentiment score and label for each document

In [None]:
# Run the sentiment model on every response and split the resulting
# (label, score) pairs into two columns.
labels, scores = zip(*df['for_topic_model'].map(sa))
df['sa_label'] = labels
df['sa_score'] = scores

In [None]:
# Display the frame, now including the sa_label and sa_score columns
df

Unnamed: 0,Category,What is the one improvement that would make the biggest difference in how we serve our customers?,for_topic_model,cleaned,cleaned_joined,length,topic,topic_probability,main_topic,Sub-topic,inferred_topic,sub-topic_name,inferred_desc,inferred_summary,sa_label,sa_score
0,Leadership,"Towards autonomous team, leadership should tru...","towards autonomous team, leadership should tru...","['autonomous', 'team', 'leadership', 'trust', ...",autonomous team leadership trust nurture self ...,22,2,0.313413,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,neutral,0.601622
1,Leadership,I've been here for a year. There are things do...,i've been here for a year. there are things do...,"['year', 'thing', 'lot', 'thing', 'inefficient...",year thing lot thing inefficient choose leader...,45,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,negative,0.882785
2,Leadership,Leaders should be role models. They should be ...,leaders should be role models. they should be ...,"['leader', 'role', 'model', 'appoint', 'base',...",leader role model appoint base competence know...,35,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,neutral,0.528839
3,Leadership,I observe some leaders are reactive in making ...,i observe some leaders are reactive in making ...,"['leader', 'reactive', 'make', 'decision', 'as...",leader reactive make decision ask context,14,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,negative,0.744499
4,Leadership,Leaders lead by example,leaders lead by example.,"['leader', 'lead', 'example']",leader lead example,4,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,positive,0.615857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,Operating Efficiency,Preventing fraud,preventing fraud.,"['prevent', 'fraud']",prevent fraud,2,-1,0.154625,,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.673997
558,Operating Efficiency,Agility,agility.,['agility'],agility,1,-1,0.491026,,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.629514
559,Values,Good Job,good job.,"['good', 'job']",good job,2,-1,0.018338,,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",positive,0.924362
560,Goals & Objectives,More direction,more directions.,['direction'],direction,2,-1,0.599262,,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.822549


### Get the probability distribution of the sentiments for each document

In [None]:
# Score every response and expand the per-label probabilities into columns.
# Building the frame from a list of dicts avoids the slow `.apply(pd.Series)`,
# and dropping any existing probability columns first keeps the cell
# idempotent: re-running it no longer appends duplicate columns.
sentiment_probs = pd.DataFrame(
    [sa(text, return_all=True) for text in df['for_topic_model']],
    index=df.index,
)
df = df.drop(columns=sentiment_probs.columns, errors='ignore').join(sentiment_probs)
df

Unnamed: 0,Category,What is the one improvement that would make the biggest difference in how we serve our customers?,for_topic_model,cleaned,cleaned_joined,length,topic,topic_probability,main_topic,Sub-topic,inferred_topic,sub-topic_name,inferred_desc,inferred_summary,sa_label,sa_score,negative,neutral,positive
0,Leadership,"Towards autonomous team, leadership should tru...","towards autonomous team, leadership should tru...","['autonomous', 'team', 'leadership', 'trust', ...",autonomous team leadership trust nurture self ...,22,2,0.313413,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,neutral,0.601622,0.023430,0.601622,0.374948
1,Leadership,I've been here for a year. There are things do...,i've been here for a year. there are things do...,"['year', 'thing', 'lot', 'thing', 'inefficient...",year thing lot thing inefficient choose leader...,45,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,negative,0.882785,0.882785,0.101710,0.015505
2,Leadership,Leaders should be role models. They should be ...,leaders should be role models. they should be ...,"['leader', 'role', 'model', 'appoint', 'base',...",leader role model appoint base competence know...,35,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,neutral,0.528839,0.067259,0.528839,0.403902
3,Leadership,I observe some leaders are reactive in making ...,i observe some leaders are reactive in making ...,"['leader', 'reactive', 'make', 'decision', 'as...",leader reactive make decision ask context,14,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,negative,0.744499,0.744499,0.246060,0.009440
4,Leadership,Leaders lead by example,leaders lead by example.,"['leader', 'lead', 'example']",leader lead example,4,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,positive,0.615857,0.022440,0.361703,0.615857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,Operating Efficiency,Preventing fraud,preventing fraud.,"['prevent', 'fraud']",prevent fraud,2,-1,0.154625,,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.673997,0.262557,0.673997,0.063447
558,Operating Efficiency,Agility,agility.,['agility'],agility,1,-1,0.491026,,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.629514,0.305464,0.629514,0.065023
559,Values,Good Job,good job.,"['good', 'job']",good job,2,-1,0.018338,,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",positive,0.924362,0.013232,0.062405,0.924362
560,Goals & Objectives,More direction,more directions.,['direction'],direction,2,-1,0.599262,,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.822549,0.040827,0.822549,0.136624


Save the dataframe

* When experimenting, remember to change the `path` or `name` of the file in case you have previously saved to this path; otherwise the earlier file is overwritten.

In [None]:
# Persist the sentiment-scored responses for the selected question
df.to_csv(
    f'/content/drive/Shareddrives/HR x R&D Collaboration/Topic Modeling/GPulse Free-Text Responses/data/topics/bertopic_{q}_sentiment.csv',
    index=False,
)

### Load the previously saved dataframe and scale the values into the `f_sa_score` column

#### Prepare scaling f_score

In [None]:
def scale(x):
    """Shift a score so that the range [-1, 1] maps onto [0, 2].

    The multiplier `2 / 2` evaluates to 1.0 (presumably target range width
    divided by source range width), so the result is `x + 1` as a float.
    """
    span_ratio = 2 / 2
    shifted = x + 1
    return shifted * span_ratio

In [None]:
# Reload the sentiment-scored responses for the selected question
df = pd.read_csv(
    f'/content/drive/Shareddrives/HR x R&D Collaboration/Topic Modeling/GPulse Free-Text Responses/data/topics/bertopic_{q}_sentiment.csv'
)

categories = ['neutral', 'positive', 'negative']

# Mean probability of each sentiment class per topic
x = df[categories + ['topic']].groupby('topic').mean().reset_index()
x

Unnamed: 0,topic,neutral,positive,negative
0,-1,0.554401,0.348645,0.096954
1,0,0.536033,0.300391,0.163576
2,1,0.565865,0.282773,0.151362
3,2,0.424747,0.279483,0.29577
4,3,0.422935,0.348857,0.228208
5,4,0.50408,0.256924,0.238996
6,5,0.56296,0.257393,0.179647
7,6,0.436786,0.292402,0.270812
8,7,0.540813,0.396637,0.06255
9,8,0.63017,0.240795,0.129035


In [None]:
# Net sentiment per response: positive minus negative probability (range [-1, 1])
df['f_sa_score'] = df['positive'] - df['negative']
# Shift onto the [0, 2] range with `scale`, then round to 4 decimals
df['f_sa_score'] = df['f_sa_score'].apply(lambda x: scale(x))
df['f_sa_score'] = df['f_sa_score'].apply(lambda x: round(x, 4))
# The columns below are addressed by position. In the recorded output,
# column 1 is the survey question text, so `Question` becomes "<q>: <question>".
df['Question'] = q+': '+df.columns[1]
df['q_name'] = df.columns[1]
df['q_number'] = q
# NOTE(review): in the recorded output column 3 is `cleaned` (the token list),
# so the token list ends up named 'raw' — confirm this is the intended column.
df.rename(columns={df.columns[3]: 'raw'}, inplace=True)

In [None]:
# Rename the working columns to the names used in the final output.
df = df.rename(columns={'for_topic_model': 'Feedback',
                        'main_topic': 'Topic',
                        'topic': 'Sub-sub-topic'})

# Fill missing values by assigning the result back to the column.
# `df[col].fillna(..., inplace=True)` operates on a selected column object;
# it is deprecated and does not update `df` under pandas Copy-on-Write.
df["Topic"] = df["Topic"].fillna("X")
df["cleaned_joined"] = df["cleaned_joined"].fillna("-")

In [None]:
# Scratch check: difference of two probabilities
# (presumably positive - negative for one response — TODO confirm which row).
0.371683 - 0.019369

0.35231399999999996

In [None]:
# Scratch check: `scale` shifts the value by +1
scale(0.35231399999999996)

1.352314

In [None]:
# Display the frame with the renamed columns and the f_sa_score column
df

Unnamed: 0,Category,What is the one improvement that would make the biggest difference in how we serve our customers?,Feedback,raw,cleaned_joined,length,Sub-sub-topic,topic_probability,Topic,Sub-topic,inferred_topic,sub-topic_name,inferred_desc,inferred_summary,sa_label,sa_score,negative,neutral,positive,f_sa_score,Question,q_name,q_number
0,Leadership,"Towards autonomous team, leadership should tru...","towards autonomous team, leadership should tru...","['autonomous', 'team', 'leadership', 'trust', ...",autonomous team leadership trust nurture self ...,22,2,0.313413,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,neutral,0.601622,0.023430,0.601622,0.374948,1.3515,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1
1,Leadership,I've been here for a year. There are things do...,i've been here for a year. there are things do...,"['year', 'thing', 'lot', 'thing', 'inefficient...",year thing lot thing inefficient choose leader...,45,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,negative,0.882785,0.882785,0.101710,0.015505,0.1327,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1
2,Leadership,Leaders should be role models. They should be ...,leaders should be role models. they should be ...,"['leader', 'role', 'model', 'appoint', 'base',...",leader role model appoint base competence know...,35,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,neutral,0.528839,0.067259,0.528839,0.403902,1.3366,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1
3,Leadership,I observe some leaders are reactive in making ...,i observe some leaders are reactive in making ...,"['leader', 'reactive', 'make', 'decision', 'as...",leader reactive make decision ask context,14,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,negative,0.744499,0.744499,0.246060,0.009440,0.2649,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1
4,Leadership,Leaders lead by example,leaders lead by example.,"['leader', 'lead', 'example']",leader lead example,4,2,1.000000,C,2_company_teams_leadership_work_processes_leve...,from the ground up,Leadership,optimization of products and services to ensur...,There are various opinions about the leadershi...,positive,0.615857,0.022440,0.361703,0.615857,1.5934,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,Operating Efficiency,Preventing fraud,preventing fraud.,"['prevent', 'fraud']",prevent fraud,2,-1,0.154625,X,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.673997,0.262557,0.673997,0.063447,0.8009,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1
558,Operating Efficiency,Agility,agility.,['agility'],agility,1,-1,0.491026,X,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.629514,0.305464,0.629514,0.065023,0.7596,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1
559,Values,Good Job,good job.,"['good', 'job']",good job,2,-1,0.018338,X,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",positive,0.924362,0.013232,0.062405,0.924362,1.9111,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1
560,Goals & Objectives,More direction,more directions.,['direction'],direction,2,-1,0.599262,X,-1_tech_company_employees_focus_marketing_agil...,,Outlier,,"In the documents, different areas of improveme...",neutral,0.822549,0.040827,0.822549,0.136624,1.0958,Q1: What is the one improvement that would mak...,What is the one improvement that would make th...,Q1


# Save the final dataframe

* When experimenting, remember to change the `path` or `name` of the file in case you have previously saved to this path; otherwise the earlier file is overwritten.

In [None]:
# Persist the final frame (sentiment columns plus question metadata)
df.to_csv(
    f'/content/drive/Shareddrives/HR x R&D Collaboration/Topic Modeling/GPulse Free-Text Responses/data/topics/bertopic_{q}_fwithsentiment.csv',
    index=False,
)

# Append to master database

In [None]:
# Run Colab's user authentication so Google APIs can be used from this runtime
from google.colab import auth
auth.authenticate_user()

# gspread / gspread_dataframe are used to write DataFrames to Google Sheets
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.auth import default
# Default Google credentials; `write_to_gsheet` passes them to gspread.authorize()
creds, _ = default()

## Initiate first Q

In [None]:
# Question whose final CSV is loaded for the master sheet
q = 'Q1'

In [None]:
# Final sentiment-scored responses for the selected question
df_q1 = pd.read_csv(
    f'/content/drive/Shareddrives/HR x R&D Collaboration/Topic Modeling/GPulse Free-Text Responses/data/topics/bertopic_{q}_fwithsentiment.csv'
)

In [None]:
# Google Sheets id of the spreadsheet that receives the master data
spreadsheet_id = '1hhmR_SLG28LQv4_oJ9gEyXNxOjplgNeNEYfSGQGd1yg'

def write_to_gsheet(spreadsheet_id, sheet_name, data_df):
    """Write a DataFrame into a newly added worksheet of a Google Sheet.

    Authorizes gspread with the notebook-level `creds`, opens the spreadsheet
    identified by `spreadsheet_id`, adds a worksheet named `sheet_name`
    (50 rows per DataFrame row, 100 columns) and writes `data_df` into it.

    NOTE(review): a worksheet is added on every call; presumably this fails
    when `sheet_name` already exists — confirm before re-running.
    """
    client = gspread.authorize(creds)
    spreadsheet = client.open_by_key(spreadsheet_id)
    worksheet = spreadsheet.add_worksheet(
        sheet_name,
        rows=len(data_df)*50,
        cols=100,
    )
    set_with_dataframe(worksheet, data_df)

In [None]:
# Write the question's responses to a worksheet named 'master_data_gpulse'
write_to_gsheet(spreadsheet_id, 'master_data_gpulse', df_q1)

# Append to master topic names

## Initiate first Q

In [None]:
# Question whose topic-name table is loaded
q1 = 'Q1'

# Build the path from `q1`, the identifier defined in this cell. The path
# previously used `q`, which only worked because an earlier cell had left
# `q` set to the same value.
df_q1 = pd.read_csv(
    '/content/drive/Shareddrives/HR x R&D Collaboration/Topic Modeling/GPulse Free-Text Responses/data/topics/topic_names_'+q1+'.csv'
)

In [None]:
# Write the topic-name table to a worksheet named 'master_topic_names'
# in a second spreadsheet (its id is passed inline).
write_to_gsheet('14kWv4symkt_n7ny7nYkugRVq90g8tvUP1155h0YxNcc', 'master_topic_names', df_q1)

In [None]:
# Sub-topic name recorded for sub-sub-topic 17
df_q1.loc[df_q1['Sub-sub-topic'] == 17, 'sub-topic_name']

17   NaN
Name: sub-topic_name, dtype: float64

In [None]:
# Display the topic-name table
df_q1

Unnamed: 0,Topic,Sub-sub-topic,Sub-topic,Question,q_name,q_number,topic_name,sub-topic_name
0,A,7,7_customers_feedback_support_urgent concerns_t...,Q1: cleaned,cleaned,Q1,,
1,A,8,8_business_inclusion_top priority_valuable imp...,Q1: cleaned,cleaned,Q1,,
2,A,10,10_services_needs_discounts_businesses_product...,Q1: cleaned,cleaned,Q1,,
3,A,16,16_loyal customers_concerns_payroll_acquisitio...,Q1: cleaned,cleaned,Q1,,
4,B,0,0_employees_salary_benefits_compensation_bonus...,Q1: cleaned,cleaned,Q1,,
5,B,5,5_balance_work_stress_workloads_headcount_week...,Q1: cleaned,cleaned,Q1,,
6,B,13,13_wfh_employees_onsite_work_office_support_un...,Q1: cleaned,cleaned,Q1,,
7,C,1,1_teams_collaboration_business_tribe_different...,Q1: cleaned,cleaned,Q1,,
8,C,2,2_company_teams_leadership_work_processes_leve...,Q1: cleaned,cleaned,Q1,,
9,C,3,3_gcash_products_customers_wallet_company_form...,Q1: cleaned,cleaned,Q1,,
