<a href="https://colab.research.google.com/github/ykim71/google_toxicity/blob/main/google_toxicity_update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load File via GDrive



In [None]:
"""
Run this cell to mount your Google Drive; Colab will ask for permission to access your Google account.
This gives Colab direct access to any files in your Google Drive.
Alternatively, upload the file from your local machine.
"""
from google.colab import drive

# Mount Google Drive at /content/drive (Colab asks for account permission).
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
"""
This code changes the working directory that Colab is using. I created a 'toxicity' folder in my Google Drive and set it as the working directory,
so I can load and save data in the 'toxicity' folder in my Google Drive.
"""
%cd drive/'MyDrive'/toxicity/

/content/drive/MyDrive/toxicity


## load your file to Colab

In [None]:
"""
Load your file into Colab using the following code. Replace 'sample_df.csv' with your file name.
I named my data 'sample_text'; you can replace it with another name.
"""

import pandas as pd

# Relative path: the file is looked up in the current working directory.
sample_text = pd.read_csv('sample_df.csv') # or a file in the Colab environment, e.g. /content/sample_code_review.csv


In [None]:
"""
Take 10 random samples to check that the data has loaded successfully; 'text' is the column that you want to analyze.
"""
# len(sample_text)  # uncomment to check the number of rows
# Ten random rows; no random_state is set, so the selection differs on every run.
sample_text.sample(10)

Unnamed: 0,Unnamed: 0.1,text
1151,182990,I dont think that was the question . QUESTION:...
1328,82542,"Yes, thats basically it. I mean, you know, the..."
455,366059,And really this happens through having kind of...
581,411306,"We are of course, standing by for the start of..."
375,347725,"So listen, the Chinese Communist Party has sys..."
1091,78774,"I mean, two months ago, we were hearing about ..."
1208,105435,"On Sunday, make sure you join CNN\s Fareed Zak..."
1014,9551,"Why? 80 percent of us, who get this virus, God..."
94,272573,"And thats changing rapidly, you know that, you..."
1028,84745,"So her baby, Esther, was born. Esther was born..."


# Perspective API toxicity



> Language Attributes: https://developers.perspectiveapi.com/s/about-the-api-attributes-and-languages



> API Request: https://developers.perspectiveapi.com/s/docs-get-started (note: a UT Google account may not work; using a personal Google account for the request is recommended)



In [None]:
"""
load packages/libraries
"""
from googleapiclient import discovery
from googleapiclient.errors import HttpError


In [None]:
"""
Enter your Perspective API key here.
"""
# Prefer the PERSPECTIVE_API_KEY environment variable so a real key never has to be
# saved inside the notebook; otherwise replace the 'your-api' placeholder below.
import os

API_KEY = os.environ.get('PERSPECTIVE_API_KEY', 'your-api')


In [None]:
"""
Run this code if you want to score text data on 4 measures: Toxicity, Likely to Reject, Insult, and Identity Attack.
See the comments below for other variables and detailed descriptions.

"""
# variable descriptions: https://github.com/conversationai/perspectiveapi
# you can replace toxicity attributes here:
analyze_request = {
    # Placeholder comment; the 'text' value is filled in for every text that is scored.
    'comment': {'text': 'xx'},
    # Use the exact attribute names listed on the Perspective API page.
    'requestedAttributes': {
        'TOXICITY@6': {},
        'LIKELY_TO_REJECT@2': {},
        'INSULT': {},
        'IDENTITY_ATTACK': {},
    },
    # for other settings, https://developers.perspectiveapi.com/s/about-the-api-methods
    'doNotStore': True,
    # The API documents 'languages' as a list of language codes, not a bare string.
    'languages': ['en'],
}



In [None]:
# for a single text
import json

def incivility_measures(text, service=None, request=None):
  """Score a single text with the Perspective API and print the four measures.

  Args:
    text: The string to analyze.
    service: Optional, already-built Comment Analyzer client. When omitted, a
      client is built with `discovery.build` using the notebook's `API_KEY`.
    request: Optional request template (dict). When omitted, the notebook's
      `analyze_request` is used. The template is copied, never modified.

  Returns:
    None. The text and its toxicity, reject, insult and identity scores are
    printed, one per line.

  Raises:
    Nothing is caught here: any error raised by the API call, or a KeyError
    for an attribute missing from the response, propagates to the caller.
  """
  if service is None:
    service = discovery.build(
        "commentanalyzer",
        "v1alpha1",
        developerKey=API_KEY,
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False,
    )
  if request is None:
    request = analyze_request

  # Build a per-call body instead of writing the text into the shared template,
  # so scoring one text never changes what a later call sends.
  comment = dict(request.get('comment', {}), text=text)
  body = dict(request, comment=comment)

  # The response is indexed directly; re-serializing it through json is not needed.
  response = service.comments().analyze(body=body).execute()
  scores = response['attributeScores']

  toxicity = scores['TOXICITY@6']['summaryScore']['value']
  reject = scores['LIKELY_TO_REJECT@2']['summaryScore']['value']
  insult = scores['INSULT']['summaryScore']['value']
  identity = scores['IDENTITY_ATTACK']['summaryScore']['value']

  print(f"text:{text}\ntoxicity:{toxicity}\nreject:{reject}\ninsult:{insult}\nidentity:{identity}")


In [None]:

# First a hand-written example, then the row labelled 1 of the loaded data.
text = "To all those thinking about voting for Trump, remember...  IN YOUR HEART... YOU KNOW HE'S SHIT"
incivility_measures(text)

text = sample_text['text'][1]
incivility_measures(text)


text:To all those thinking about voting for Trump, remember...  IN YOUR HEART... YOU KNOW HE'S SHIT
toxicity:0.94727516
reject:0.99898785
insult:0.71120167
identity:0.09659086
text:And with red tape slashed, scientists, medical researchers, our doctors are now rapidly developing not only treatments, but hopefully a vaccine. This unprecedented cooperation between the government, private industry, this will not forever change the way this country and the world will deal with future pandemics and crises, just like the travel ban.
toxicity:0.072587214
reject:0.06286466
insult:0.009431887
identity:0.0027008436


In [None]:
# run this code chunk
import csv
import codecs
import json
import time
import pandas as pd

def incivility_for_chunks(sample_text_file, text, service=None, request=None):
  """Score every text with the Perspective API and attach the scores to a frame.

  Args:
    sample_text_file: DataFrame whose rows correspond, in order, to `text`.
    text: Iterable of strings to analyze, one per row of `sample_text_file`.
    service: Optional, already-built Comment Analyzer client. When omitted, a
      client is built with `discovery.build` using the notebook's `API_KEY`.
    request: Optional request template (dict). When omitted, the notebook's
      `analyze_request` is used. The template is copied for every call and
      never modified.

  Returns:
    A new DataFrame: `sample_text_file` with 'toxicity', 'reject', 'insult'
    and 'attack' columns appended. The input frame is left untouched.
    Scores are matched to rows by position, so the frame keeps its own index
    labels even when they are not 0..n-1. A row whose request raised HttpError
    holds the string "error" in all four columns. Rows without a matching
    text get NaN; texts beyond the last row are ignored. Score columns already
    present in the input are replaced.

  Side effects:
    Sleeps 10 seconds after every HttpError and prints the elapsed time.
  """
  # (output column, Perspective attribute name) in output order.
  attribute_for_column = (
      ('toxicity', 'TOXICITY@6'),
      ('reject', 'LIKELY_TO_REJECT@2'),
      ('insult', 'INSULT'),
      ('attack', 'IDENTITY_ATTACK'),
  )
  error_pause_seconds = 10.0

  start = time.time()

  if service is None:
    service = discovery.build(
        "commentanalyzer",
        "v1alpha1",
        developerKey=API_KEY,
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False,
    )
  if request is None:
    request = analyze_request

  scored_rows = []
  for document in text:
    # Per-call body: the shared template is not written to.
    comment = dict(request.get('comment', {}), text=document)
    body = dict(request, comment=comment)

    try:
      response = service.comments().analyze(body=body).execute()
      attribute_scores = response['attributeScores']
      row = [attribute_scores[attribute]['summaryScore']['value']
             for _, attribute in attribute_for_column]
    except HttpError:
      # Keep the "error" marker so failed rows can be selected and re-run later.
      row = ["error"] * len(attribute_for_column)
      time.sleep(error_pause_seconds)  # pause before the next request

    scored_rows.append(row)

  # Attach the scores by position. Joining on a fresh 0..n-1 index would
  # misalign the scores for any frame whose labels are not 0..n-1 (for
  # example a subset of rows selected from a larger frame).
  n_rows = len(sample_text_file)
  column_names = [column for column, _ in attribute_for_column]
  result = sample_text_file.drop(columns=column_names, errors='ignore').copy()
  for position, column in enumerate(column_names):
    values = [row[position] for row in scored_rows[:n_rows]]
    values.extend([float('nan')] * (n_rows - len(values)))
    result[column] = values

  end = time.time()
  print("complete time: ", round(end - start, 2))

  return result

In [None]:

"""
HERE is the code you need to change. My data file name is 'sample_text' and the column name is 'text'. 
You can change your file and the text column name here.  
As another example, if your data name is 'df' and the text column name is 'comment', 
the first line of the following code should be:

text = df.comment.values.tolist()

"""

text = sample_text.text.values.tolist()

"""
The scoring loop now lives in a function so that it can be run several times.

Pass two arguments to the function below: (1) 'sample_text' is the data file you have, and
(2) 'text' is the text data that you want to analyze (already assigned above).
As another example, if your data name is 'df' and the text column name is 'comment':

text = df.comment.values.tolist()

df = incivility_for_chunks(df, text)

"""
# incivility_for_chunks returns a new DataFrame instead of modifying its input, so the
# result must be assigned back; the later cells read the score columns from sample_text.
sample_text = incivility_for_chunks(sample_text, text)
sample_text


complete time:  1608.49


Unnamed: 0,Unnamed: 0.1,text,toxicity,reject,insult,attack
0,264876,We all know by now that many of the CDCs early...,0.250108,0.297678,0.0632,0.003885
1,246504,"And with red tape slashed, scientists, medical...",0.072587,0.062865,0.009432,0.002701
2,496072,Both of those pieces of information come toget...,0.099032,0.137282,0.00833,0.003959
3,234558,We now know these crucial and critical and unp...,0.123592,0.166649,0.031697,0.004884
4,223946,Expanding test for COVID-19 pushing the race f...,error,error,error,error
...,...,...,...,...,...,...
1494,105691,Hundreds of cases of multisymptomatic inflamma...,0.038754,0.178228,0.007874,0.004033
1495,205301,"I think the other thing is, you know, look, Wo...",0.031415,0.055915,0.026832,0.011469
1496,105736,There is a very serious question tonight about...,error,error,error,error
1497,105754,"Our breaking news tonight, more than half of s...",0.127841,0.335127,0.009679,0.005735


In [None]:
"""
Take the first 10 rows to check that the data was computed successfully.
"""
# head(10) returns ten rows; a [0:9] slice stops after nine.
sample_text.head(10)

Unnamed: 0,Unnamed: 0.1,text
0,264876,We all know by now that many of the CDCs early...
1,246504,"And with red tape slashed, scientists, medical..."
2,496072,Both of those pieces of information come toget...
3,234558,We now know these crucial and critical and unp...
4,223946,Expanding test for COVID-19 pushing the race f...
5,290590,"And all you hear about is the red tape, the ho..."
6,265780,"Whatever the President has to do, to cut red t..."
7,225043,And so what the President and the bill can use...
8,246890,Ive asked the FDA to cut through the red tape ...


In [None]:
"""
Check whether there are errors; errors can occur for many reasons, such as the API rate limit (in which case you may have to re-run those rows),
or because the Perspective API can't analyze the text, e.g. because it is in a different language.

"""

# Rows whose toxicity column holds the "error" marker.
sample_text.loc[sample_text['toxicity'].eq("error")]

Unnamed: 0,Unnamed: 0.1,text,toxicity,reject,insult,attack
84,317561,"You may ask about the Coronavirus, which is ve...",error,error,error,error
87,226887,"Well, the Metropolitan Museum of Arts in New Y...",error,error,error,error
88,272145,What they want is people want to keep this goi...,error,error,error,error
90,265801,"Look, China is paying us billions and billions...",error,error,error,error
91,213223,"OK. In the meantime, there is a package thats ...",error,error,error,error
...,...,...,...,...,...,...
1356,210078,"All right, it\s time for your questions about ...",error,error,error,error
1370,184305,"Meantime, the conversation about how and when ...",error,error,error,error
1415,495415,Californias first suspected community spread i...,error,error,error,error
1416,211393,"Wolf, thank you for having me on. And the answ...",error,error,error,error


In [None]:
"""
I'm going to re-run those errors and merge them to those completed;
(1) save complete cases separately
(2) select error cases and remove error columns (toxicity, etc)
"""
is_error = sample_text['toxicity'] == "error"
complete_cases = sample_text[~is_error]
# .drop() returns a new frame, so a slice of sample_text is never modified in place.
error_cases = sample_text[is_error].drop(columns=['toxicity', 'reject', 'insult', 'attack'])

text = error_cases.text.values.tolist()

# Score on a temporary 0..n-1 index so every score lines up with its row by position,
# then restore the original row labels, which the final merge sorts on. The result is
# assigned back because incivility_for_chunks returns a new frame.
rescored = incivility_for_chunks(error_cases.reset_index(drop=True), text)
rescored.index = error_cases.index
error_cases = rescored



complete time:  136.59


In [None]:
"""
Check the output again; see whether the remaining errors are due to the API limit or to language issues.
If there are errors again, repeat the code above.

"""
# Rows that are still marked "error" after the re-run.
error_cases.loc[error_cases['toxicity'].eq('error')]

Unnamed: 0,Unnamed: 0.1,text,toxicity,reject,insult,attack
87,226887,"Well, the Metropolitan Museum of Arts in New Y...",error,error,error,error
88,272145,What they want is people want to keep this goi...,error,error,error,error
92,274336,"It is called a loan, but it is more like a gra...",error,error,error,error
93,314829,We have already dispersed $0.5 billion to stat...,error,error,error,error
95,247670,"And, Geraldo, thats insane when you say -- irr...",error,error,error,error
176,265936,We therefore recommend that Covid-19 patients ...,error,error,error,error


In [None]:
"""
(1) assign the successful cases from error_cases to error_cases_r1
(2) assign the remaining error cases to error_cases_r2

"""
still_error = error_cases['toxicity'] == "error"
error_cases_r1 = error_cases[~still_error]
# .drop() returns a new frame, so a slice of error_cases is never modified in place.
error_cases_r2 = error_cases[still_error].drop(columns=['toxicity', 'reject', 'insult', 'attack'])

text = error_cases_r2.text.values.tolist()

# Score on a temporary 0..n-1 index so every score lines up with its row by position,
# then restore the original row labels, which the final merge sorts on. The result is
# assigned back because incivility_for_chunks returns a new frame.
rescored_r2 = incivility_for_chunks(error_cases_r2.reset_index(drop=True), text)
rescored_r2.index = error_cases_r2.index
error_cases_r2 = rescored_r2



In [None]:
# Rows of the second retry set that are marked "error".
error_cases_r2.loc[error_cases_r2['toxicity'].eq('error')]

In [None]:
"""
merge all files

"""

# Stack the three result sets, then put the rows back in index order.
sample_text_final = pd.concat([complete_cases, error_cases_r1, error_cases_r2]).sort_index()


In [None]:
"""
Export the results -- save the data to your Google Drive OR the Colab environment. 
You can also find your data under the folder icon on the left side and download it.
"""

# Written relative to the current working directory; the index is saved as the first column.
sample_text_final.to_csv(path_or_buf='toxicity_done.csv')

In [None]:
# """
# For large text: 
# assign your column name (that contain text data you want to analyze) in the code

# Note that there's a limit to process depending on your API; added a 5 second pause when error occurs;
# """
# import csv
# import codecs
# import json
# import time
# import pandas as pd

# # setting attributes; more attributes can be added

# service = discovery.build(
#   "commentanalyzer",
#   "v1alpha1",
#   developerKey=API_KEY,
#   discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
#   static_discovery=False,
# )

# start = time.time()

# comments_toxicity_list = []
# comments_reject_list = []
# comments_insult_list = []
# comments_identity_list = []

# """
# HERE is the code you need to change. My data file name is 'sample_text' and the column name is 'text'. 
# You can change your file and the text column name here.  
# As another example, if your data name is 'df' and the text column name is 'comment', 
# the first line of the following code should be:

# for i in df.comment.values.tolist(): 

# Once you change this code, run this code block.
# """

# for i in sample_text.text.values.tolist(): 
#   analyze_request['comment']['text'] = i
  
#   try:
#     response = service.comments().analyze(body=analyze_request).execute()
#     i = json.loads(json.dumps(response, indent=2))
    
#     comments_toxicity = i['attributeScores']['TOXICITY@6']['summaryScore']['value']
#     comments_reject = i['attributeScores']['LIKELY_TO_REJECT@2']['summaryScore']['value']
#     comments_insult = i['attributeScores']['INSULT']['summaryScore']['value']
#     comments_identity = i['attributeScores']['IDENTITY_ATTACK']['summaryScore']['value']
        
#   except HttpError:
#     comments_toxicity = "error"
#     comments_reject = "error"
#     comments_insult = "error"
#     comments_identity = "error"
#     time.sleep(5.0) # added 5 second pause when error occurs
            
#   comments_toxicity_list.append(comments_toxicity)
#   comments_reject_list.append(comments_reject)
#   comments_insult_list.append(comments_insult)
#   comments_identity_list.append(comments_identity)
        
# sample_text = sample_text.join(pd.DataFrame({'toxicity': comments_toxicity_list, 
#                                              'reject': comments_reject_list, 
#                                              'insult': comments_insult_list, 
#                                              'attack': comments_identity_list}))

# end=time.time()
# print("complete time: ", round(end -start, 2))

complete time:  1114.99
